我正在尝试从 'YP_LA_Remodel_urls.csv' 文件中读取 URL(下面列出了其中几个),逐个抓取,然后将结果导出到 'Yp_LA_Remodel_Info.csv'。
如果我只使用单个网址(不从 csv 读取)进行抓取,一切正常。问题出在批量抓取上,我在这一步卡住了。我已经列出了需要提取的信息字段。
我沿用了自己之前为另一个爬虫编写的脚本,但它在这里似乎不起作用。我是 Python 新手,请多包涵。
任何帮助和/或建议都值得赞赏。
示例网址:
https://www.yellowpages.com/search?search_terms=remodeling&geo_location_terms=Los%20Angeles%2C%20CA&page=1
https://www.yellowpages.com/search?search_terms=remodeling&geo_location_terms=Los%20Angeles%2C%20CA&page=2
脚本:
import csv
from urllib.request import urlopen
import pandas as pd
from bs4 import BeautifulSoup
from email import encoders
import time
import os
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
import requests
def license_exists(soup):
    """Report whether the parsed page contains a "next page" element.

    Args:
        soup: A parsed page (for example a ``BeautifulSoup`` object) that can
            be called with a ``class_`` keyword to search by CSS class.

    Returns:
        bool: True if at least one element matches the class string
        ``"next ajax-page"``, otherwise False.
    """
    # The result depends only on ``soup``, so no file needs to be opened here;
    # always return a real bool rather than falling through to None.
    return bool(soup(class_="next ajax-page"))
# Column order of the exported CSV; each scraped record follows this order.
FIELDNAMES = ['biz_name', 'biz_phone', 'biz_address', 'biz_city', 'biz_zip']


def _itemprop_text(result, tag, itemprop):
    """Return the text of the first ``tag`` with the given ``itemprop``.

    Args:
        result: A parsed listing element exposing ``find``.
        tag: Tag name to look for (e.g. ``'span'`` or ``'div'``).
        itemprop: Value of the ``itemprop`` attribute to match.

    Returns:
        The element's text, or ``''`` when no such element exists.
    """
    # Look the node up once and test that same node, so the existence check
    # and the extraction can never target different tags.
    node = result.find(tag, attrs={'itemprop': itemprop})
    return node.text if node is not None else ''


# Read every URL from the input CSV, scrape each page, and collect one
# record per business listing.
records = []
with open('YP_LA_Remodel_urls.csv', newline='') as f_input:
    for row in csv.reader(f_input):
        # Skip blank lines so row[0] cannot raise IndexError.
        if not row or not row[0].strip():
            continue
        r = requests.get(row[0].strip())  # Assume the URL is in the first column
        soup = BeautifulSoup(r.text, "html.parser")
        for result in soup.find_all('div', attrs={'class': 'info'}):
            records.append((
                _itemprop_text(result, 'span', 'name'),
                # The phone number is read from a <div>, unlike the other fields.
                _itemprop_text(result, 'div', 'telephone'),
                _itemprop_text(result, 'span', 'streetAddress'),
                _itemprop_text(result, 'span', 'addressLocality'),
                _itemprop_text(result, 'span', 'postalCode'),
            ))

# Write all collected records in one pass once scraping has finished.
df = pd.DataFrame(records, columns=FIELDNAMES)
df.to_csv('Yp_LA_Remodel_Info.csv', index=False, encoding='utf-8')
答案 0 :(得分:0)
以下代码针对这两个网址。如需抓取更多网址(例如 10000 个),请相应扩充 links 列表。
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Search-result pages to scrape; extend this list to cover more pages.
links = [
    'https://www.yellowpages.com/search?search_terms=remodeling&geo_location_terms=Los%20Angeles%2C%20CA&page=1',
    'https://www.yellowpages.com/search?search_terms=remodeling&geo_location_terms=Los%20Angeles%2C%20CA&page=2',
]

# Column order of the resulting DataFrame.
COLUMNS = ['biz_name', 'biz_phone', 'biz_address', 'biz_city', 'biz_zip']


def _itemprop_text(result, tag, itemprop):
    """Return the text of the first ``tag`` with the given ``itemprop``.

    Args:
        result: A parsed listing element exposing ``find``.
        tag: Tag name to look for (e.g. ``'span'`` or ``'div'``).
        itemprop: Value of the ``itemprop`` attribute to match.

    Returns:
        The element's text, or ``''`` when no such element exists.
    """
    # Test the same node that is read, so the check and the extraction
    # can never target different tags.
    node = result.find(tag, attrs={'itemprop': itemprop})
    return node.text if node is not None else ''


# Collect plain rows first and build the DataFrame once at the end, instead
# of growing it one ``.loc`` assignment at a time.
rows = []
for link in links:
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup_data = BeautifulSoup(requests.get(link).content, "html.parser")
    for result in soup_data.find_all('div', attrs={'class': 'info'}):
        rows.append([
            _itemprop_text(result, 'span', 'name'),
            # The phone number is read from a <div>, unlike the other fields.
            _itemprop_text(result, 'div', 'telephone'),
            _itemprop_text(result, 'span', 'streetAddress'),
            _itemprop_text(result, 'span', 'addressLocality'),
            _itemprop_text(result, 'span', 'postalCode'),
        ])

# One row per listing, indexed 0..n-1 in scrape order.
container = pd.DataFrame(rows, columns=COLUMNS)
输出
biz_name biz_phone biz_address biz_city \
0
1
2 Washington Construction 2874 W 8th St Los Angeles,
3 Os Remodeling Inc. 220 N Avenue 53 Apt 202 Los Angeles,
4 A A Allied Construction 1212 S Longwood Ave Los Angeles,
biz_zip
0
1
2 90005
3 90042
4 90019
希望这会有所帮助!!