I'm new to web scraping. I'm scraping data from a webpage: I scrape the hrefs from the first page, then go to each href and find the p tags inside 'address-data'. I want to store one URL's p-tag data in one row, and the second URL's p-tag data in the second row. My data gets appended to 'myUrl'. I want to save the data in a CSV file, e.g. address, longitude, phone, email, and then start a new row.
Here is my code:
from bs4 import BeautifulSoup
import requests
import csv

myUrl = []
urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged{}".format(i) for i in range(1, 10)]  # make a url list and iterate over it
for url in urls:
    r = requests.get(url)
    print('idr1')
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'main-link'}):
        iurl = link.get('href')
        r = requests.get(iurl)
        print(iurl)
        soup = BeautifulSoup(r.content, "lxml")
        with open('lhr.cv', 'wb') as file:
            divs = soup.find_all('div', attrs={"class": "address-data"})
            for div in divs:
                myUrl.append(div.find('p').text)
                #print(myUrl)

with open('lhr.cv', 'w') as file:
    writer = csv.writer(file)
    for row in myUrl:
        writer.writerow(row)
Expected output:
9 Fane Road، Lahore 54000, Pakistan|1.561381309140028|74.31484723624567|042-37363901-9|gm@bestwesternlahore.com/sales@bestwesternlahore.com/ reservations@bestwesternlahore.com
1/E-3, Main Boulevard Gulberg III, Lahore|31.525700029363|74.34930089283|0305-2960614|https://www.facebook.com/pages/Zauk-Banquet-Hall/204612846290857
Answer 0 (score: 0)
I've written this with xpath in Python 2 (because I think xpath queries are neater and simpler for web scraping), but this code will get you your list of links:
#Load required libraries
import requests
from lxml import html
import pandas as pd

#Create base URL
url = "http://www.shaditayari.pk/?s&post_type=ait-item&a=true&paged="

#First, we want to work out the number of pages to scrape. We load any page and get the largest page number
page = requests.get(url+str(1))
tree = html.fromstring(page.content)
no_pages = tree.xpath("//nav/a[last()]/text()")[0] #This comes out as a list of two - we only want the first one

#Next, we want to scrape the links to each page with the address
links = []
names = []
for i in range(1, int(no_pages)+1):
    page = requests.get(url+str(i))
    tree = html.fromstring(page.content)
    page_links = tree.xpath("//div[@class = 'item-title']/a/@href")
    page_names = tree.xpath("//a/h3/text()")
    links = links + page_links
    names = names + page_names
    print i

address_links = {"Name": names,
                 "URL": links}
pd.DataFrame(address_links).to_csv(u"address_links.csv")
This code still needs finishing off: the remaining appends, completing the dictionary, and the line that creates the CSV (see the sketch after the block below), but it will get your details:
address_list = []
latitude_list = []
longitude_list = []
telephone_list = []
email_list = []
webpage_list = []

counter = 0
for url in address_links["URL"]:
    page = requests.get(url)
    tree = html.fromstring(page.content)
    address = tree.xpath("//div[@itemprop = 'streetAddress']/p/text()")
    if len(address) == 0:
        address = ""
    else:
        address = address[0]
    latitude = tree.xpath("//p/meta[@itemprop = 'latitude']/@content")
    if len(latitude) == 0:
        latitude = ""
    else:
        latitude = latitude[0]
    longitude = tree.xpath("//p/meta[@itemprop = 'longitude']/@content")
    if len(longitude) == 0:
        longitude = ""
    else:
        longitude = longitude[0]
    telephone = tree.xpath("//a[@class = 'phone']/text()")
    if len(telephone) == 0:
        telephone = ""
    else:
        telephone = telephone[0]
    email = tree.xpath("//a[@itemprop = 'email']/text()")
    if len(email) == 0:
        email = ""
    else:
        email = email[0]
    webpage = tree.xpath("//a[@itemprop = 'url']/@href")
    if len(webpage) == 0:
        webpage = ""
    else:
        webpage = webpage[0]
    address_list.append(address)
    #continue for others
    counter += 1
    print counter

address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   #continue for others
                   }
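As a hedged sketch of that completion (the column names here are my own choices, not anything the site mandates): append every field inside the loop, then build the full dictionary after it and let pandas write the CSV.

    #Inside the for loop, append every field rather than only the address
    address_list.append(address)
    latitude_list.append(latitude)
    longitude_list.append(longitude)
    telephone_list.append(telephone)
    email_list.append(email)
    webpage_list.append(webpage)

#After the loop: the finished dictionary and the line that creates the CSV
address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   "Latitude": latitude_list,
                   "Longitude": longitude_list,
                   "Telephone": telephone_list,
                   "Email": email_list,
                   "Webpage": webpage_list}
pd.DataFrame(address_details).to_csv("address_details.csv")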
You may need to add some unicode encoding before you can convert it to a CSV. That's answered here.
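For instance (a sketch, assuming pandas is doing the writing): to_csv accepts an encoding argument, which avoids the usual Python 2 UnicodeEncodeError on non-ASCII characters such as the Arabic comma in some of the addresses.

#Write UTF-8 so non-ASCII characters in the scraped text survive the round trip
pd.DataFrame(address_details).to_csv("address_details.csv", index=False, encoding="utf-8")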