I'm new to web scraping. I'm scraping data from a webpage: I scrape the hrefs from the first page, then go to each href and find the p tags inside 'address-data'. I want to store one URL's p-tag data in one row, and the second URL's p-tag data in the second row. My data gets appended to 'myUrl'. I want to save the data in a CSV file, e.g. address, longitude, phone, email, and then start a new row.
Here is my code:
from bs4 import BeautifulSoup
import requests
import csv

myUrl = []
urls = ["http://www.shaditayari.pk/s&category=326&location=266&a=true&paged{}".format(i) for i in range(1, 10)]  # make a url list and iterate over it
for url in urls:
    r = requests.get(url)
    print('idr1')
    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', {'main-link'}):
        iurl = link.get('href')
        r = requests.get(iurl)
        print(iurl)
        soup = BeautifulSoup(r.content, "lxml")
        with open('lhr.cv', 'wb') as file:
            divs = soup.find_all('div', attrs={"class": "address-data"})
            for div in divs:
                myUrl.append(div.find('p').text)
                #print(myUrl)

with open('lhr.cv', 'w') as file:
    writer = csv.writer(file)
    for row in myUrl:
        writer.writerow(row)
Expected output:
9 Fane Road، Lahore 54000, Pakistan|1.561381309140028|74.31484723624567|042-37363901-9|gm@bestwesternlahore.com/sales@bestwesternlahore.com/ reservations@bestwesternlahore.com
1/E-3, Main Boulevard Gulberg III, Lahore|31.525700029363|74.34930089283|0305-2960614|https://www.facebook.com/pages/Zauk-Banquet-Hall/204612846290857
Answer 0 (score: 0)
I've written this with xpath in Python 2 (because I think xpath queries are neater and simpler for web scraping), but this code will get you your list of links:
#Load required libraries
import requests
from lxml import html
import pandas as pd

#Create base URL
url = "http://www.shaditayari.pk/?s&post_type=ait-item&a=true&paged="

#First, we want to work out the number of pages to scrape. We load any page and get the largest page number
page = requests.get(url+str(1))
tree = html.fromstring(page.content)
no_pages = tree.xpath("//nav/a[last()]/text()")[0] #This comes out as a list of two - we only want the first one

#Next, we want to scrape the links to each page with the address
links = []
names = []
for i in range(1, int(no_pages)+1):
    page = requests.get(url+str(i))
    tree = html.fromstring(page.content)
    page_links = tree.xpath("//div[@class = 'item-title']/a/@href")
    page_names = tree.xpath("//a/h3/text()")
    links = links + page_links
    names = names + page_names
    print i

address_links = {"Name": names,
                 "URL": links}
pd.DataFrame(address_links).to_csv(u"address_links.csv")
This code still needs finishing off: the remaining appends, completing the dictionary, and the line that creates the CSV (see the sketch after the block below), but it will get your details:
address_list = []
latitude_list = []
longitude_list = []
telephone_list = []
email_list = []
webpage_list = []

counter = 0
for url in address_links["URL"]:
    page = requests.get(url)
    tree = html.fromstring(page.content)
    address = tree.xpath("//div[@itemprop = 'streetAddress']/p/text()")
    if len(address) == 0:
        address = ""
    else:
        address = address[0]
    latitude = tree.xpath("//p/meta[@itemprop = 'latitude']/@content")
    if len(latitude) == 0:
        latitude = ""
    else:
        latitude = latitude[0]
    longitude = tree.xpath("//p/meta[@itemprop = 'longitude']/@content")
    if len(longitude) == 0:
        longitude = ""
    else:
        longitude = longitude[0]
    telephone = tree.xpath("//a[@class = 'phone']/text()")
    if len(telephone) == 0:
        telephone = ""
    else:
        telephone = telephone[0]
    email = tree.xpath("//a[@itemprop = 'email']/text()")
    if len(email) == 0:
        email = ""
    else:
        email = email[0]
    webpage = tree.xpath("//a[@itemprop = 'url']/@href")
    if len(webpage) == 0:
        webpage = ""
    else:
        webpage = webpage[0]
    address_list.append(address)
    #continue for others
    counter += 1
    print counter

address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   #continue for others
                   }
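As a hedged sketch of that completion (the column names here are my own choices, not anything the site mandates): append every field inside the loop, then build the full dictionary after it and let pandas write the CSV.

    #Inside the for loop, append every field rather than only the address
    address_list.append(address)
    latitude_list.append(latitude)
    longitude_list.append(longitude)
    telephone_list.append(telephone)
    email_list.append(email)
    webpage_list.append(webpage)

#After the loop: the finished dictionary and the line that creates the CSV
address_details = {"Name": names,
                   "URL": links,
                   "Address": address_list,
                   "Latitude": latitude_list,
                   "Longitude": longitude_list,
                   "Telephone": telephone_list,
                   "Email": email_list,
                   "Webpage": webpage_list}
pd.DataFrame(address_details).to_csv("address_details.csv")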
You may need to add some unicode encoding before you can convert it to a CSV. That's answered here.
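For instance (a sketch, assuming pandas is doing the writing): to_csv accepts an encoding argument, which avoids the usual Python 2 UnicodeEncodeError on non-ASCII characters such as the Arabic comma in some of the addresses.

#Write UTF-8 so non-ASCII characters in the scraped text survive the round trip
pd.DataFrame(address_details).to_csv("address_details.csv", index=False, encoding="utf-8")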