Python / BeautifulSoup:网址列表 -> 解析 -> 将数据提取到 CSV 时出错

时间:2019-07-03 07:20:15

标签: python python-3.x web-scraping beautifulsoup

我在一个 CSV 文件中有一个网址列表(该文件可以放在本地计算机上,也可以在线托管)。我需要从列表中的每个网页提取商家名称、地址和电话号码。我已经找到了所有正确的 HTML class 名称。我想把提取到的数据按上述各列写入一个 CSV 文件。

来自csv:

https://slicelife.com/restaurants/wi/milwaukee/53211/pizza-man-milwaukee/menu
https://slicelife.com/restaurants/nj/northvale/7647/three-boys-from-italy-northvale/menu
https://slicelife.com/restaurants/mn/mankato/56001/jake-s-stadium-pizza/menu
https://slicelife.com/restaurants/pa/new-brighton/15066/bakers-buck-hut/menu

我目前的代码如下:


from bs4 import BeautifulSoup
import requests
import json
import csv
from urllib.request import urlopen
import requests



# Read one restaurant URL per row from aliveSlice.csv, scrape each page and
# write one row per restaurant to scrapedBiz.csv.
#
# Both files are opened once, outside the loop: reopening the output in 'w'
# mode for every URL would truncate it and keep only the last result.
with open('aliveSlice.csv', 'r', newline='') as csvUrls_list, \
        open('scrapedBiz.csv', 'w', newline='', encoding='utf-8') as new_file:
    fieldnames = ['url', 'Raw Data', 'Business Name', 'Address', 'Phone']

    # The sample input is a bare list of URLs with no header row, so a plain
    # reader is used; DictReader would swallow the first URL as the header.
    csv_reader = csv.reader(csvUrls_list)
    csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames, delimiter=',')
    csv_writer.writeheader()

    for row in csv_reader:
        # csv.reader yields an empty list for blank lines.
        if not row or not row[0].strip():
            continue
        url = row[0].strip()

        # Fetch the menu page once; the timeout keeps a dead host from
        # stalling the whole run.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Container element that holds the name, address and phone number.
        pizzaPage = soup.find(class_='f19xeu2d')
        if pizzaPage is None:
            print(f"Missing data - {url}")
            continue

        # Look the elements up by CSS class (class_=...). A bare string as the
        # first argument is matched against the tag name and finds nothing.
        restaurantName = pizzaPage.find(class_='f13p7rsj')
        address = pizzaPage.find(class_='f1lfckhr')
        phoneNumber = pizzaPage.find(class_='f12gt8lx')

        # DictWriter needs a mapping keyed by the declared fieldnames. Any
        # element that was not found is written as an empty cell.
        pizzeriaObject = {
            'url': url,
            'Raw Data': pizzaPage.get_text(' ', strip=True),
            'Business Name': restaurantName.get_text(strip=True) if restaurantName else '',
            'Address': address.get_text(strip=True) if address else '',
            'Phone': phoneNumber.get_text(strip=True) if phoneNumber else '',
        }
        csv_writer.writerow(pizzeriaObject)





# # TrattoriArray = []
# # with open('aliveSlice.csv','r') as csvf: # Open file in read mode
    # # urls = csv.reader(csvf)
    # # for url in urls:
        # # TrattoriArray.append(url) # Add each url to list contents

# # for url in TrattoriArray:  # Parse through each url in the list.
    # # page = urlopen(url[0]).read()
    # # content = BeautifulSoup(page.content, "html.parser")

# # pizzaArray = []
# # for pizzeria in content.findAll('div', attrs={"class": "f19xeu2d"}):
    # # pizzeriaObject = {
        # # "pizzeriaName": pizzeria.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
        # # "address": pizzeria.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
        # # "phoneNumber": pizzeria.find('rc-c2d-number', attrs={"span": "rc-c2d-number"}).text.encode('utf-8'),

    # # }
    # # pizzaArray.append(pizzeriaObject)
# # with open('pizzeriaData.json', 'w') as outfile:
    # # json.dump(pizzaArray, outfile)





# # # from bs4 import BeautifulSoup
# # # import requests
# # # import json
# # # import csv
# # # from urllib.request import urlopen
# # # import urllib2
# # # import re

# # # urls = csv.reader(open('aliveSlice.csv'))
# # # for url in urls:
    # # # response = urllib2.urlopen(url[0])
    # # # html = response.read()
    # # # # print re.findall('f19xeu2d',html)
    # # # content = BeautifulSoup(f19xeu2d.content, "html.parser")

# # # # url = 'https://slicelife.com/restaurants/fl/west-palm-bea/33406/albertos-pizzeria/menu'
# # # # response = requests.get(url, timeout=5)


# # # # TrattoriArray = []
# # # # with open('aliveSlice.csv','r') as csvf: # Open file in read mode
    # # # # urls = csv.reader(csvf)
    # # # # for url in urls:
        # # # # TrattoriArray.append(url) # Add each url to list contents

# # # # for url in TrattoriArray:  # Parse through each url in the list.
    # # # # page = urlopen(url[0]).read()
    # # # # content = BeautifulSoup(page.content, "html.parser")

# # # # pizzaArray = []
# # # # for pizzeria in content.findAll('div', attrs={"class": "f19xeu2d"}):
    # # # # pizzaArray.append(pizzeriaObject)
# # # # with open('pizzeriaData.json', 'w') as outfile:
    # # # # json.dump(pizzaArray, outfile)


# # # htmlContent = response.content
# # # soup = BeautifulSoup(htmlContent, features="html.parser")


# # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8'), 
# # # pizzeriaName = soup.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
# # # address = soup.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
# # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8'),


# # # #print(soup.prettify())


# # # pizzeriaObject = {
# # # (
# # # pizzeriaName
# # # +phoneNumber
# # # +address
# # # )
# # # }


# # # print(pizzeriaObject)
# # # print(pizzeriaName)
# # # print(phoneNumber)
# # # print(address)

# # # # import requests
# # # # from bs4 import BeautifulSoup
# # # # import csv

# # # # with open('aliveSlice.csv', newline='') as f_urls, open('output.csv', 'w', newline='') as f_output:
    # # # # csv_urls = csv.reader(f_urls)
    # # # # csv_output = csv.writer(f_output)
    # # # # csv_output.writerow(['locationRawData' , 'pizzeriaName' , 'address', 'Phone'])
    # # # # csv_output.writerow(['Ngoname', 'CEO', 'City', 'Address', 'Phone', 'Mobile', 'E-mail'])

    # # # # for line in csv_urls:
        # # # # r = requests.get(line[0]).text
        # # # # soup = BeautifulSoup(r, "html.parser")

# # # # locationRawData = soup.find('h1')
# # # # print('RAW :', locationRawData.text)
# # # f13p7rsj
# # # # pizzeriaName = soup.find('h1', class_='f13p7rsj')
# # # # print('pizzeriaName:', pizzeriaName[1].text)

# # # ###########
        # # # ngoname = soup.find('h1')
        # # # print('NGO Name :', ngoname.text)

        # # # pizzeriaName = soup.find('h1', class_='').text
        # # # ceo_name = ceo.split(':')
        # # # print('CeoName:', ceo_name[1])

        # # # city = soup.find_all('span')
        # # # print('City :', city[5].text)

        # # # addressBiz = soup.find_all('address')
        # # # print('Address :', addressBiz[6].text)

        # # # phoneNumber = soup.find_all('button')
        # # # print('Phone :', phoneNumber[7].text)

        # # # mobile = soup.find_all('span')
        # # # print('Mobile :', mobile[8].text)

        # # # email = soup.find_all('span')
        # # # print('Email_id :', email[9].text)

        # # # csv_output.writerow([ngoname.text, ceo_name[1], city[5].text, address[6].text, phone[7].text, mobile[8].text, email[9].text])


# # # locationRawData = soup.find('h1')
# # # print('RAW :', locationRawData.text)

# # # pizzeriaName = soup.find('h1', class_='f13p7rsj')
# # # # pizzeria_name = pizzeriaName.split(':')
# # # print('pizzeriaName:', pizzeriaName[0])

# # # address = soup.find('address', class_='f1lfckhr')
# # # print('Address :', address[1].text)

# # # phoneNumber = soup.find('button', class_='f12gt8lx')
# # # print('Phone :', phoneNumber[2].text)

# # # locationRawData = soup.find('div', class_='f19xeu2d')
# # # print('RAW :', locationRawData[3].text)
# # # #############

# # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8'), 
# # # pizzeriaName = soup.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
# # # address = soup.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
# # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8'),

# # # # address = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8')
# # # # print('Address :', address[2].text)

# # # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8')
# # # # print('Phone :', phoneNumber[3].text)

# # # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8')
# # # # print('RAW :', locationRawData[4].text)



# # # # csv_output.writerow([locationRawData.text, pizzeria_name[1], address[2].text, phoneNumber[3].text, locationRawData[4].text])


1 个答案:

答案 0 :(得分:0)

您获取电话号码的代码引起了问题,在尝试从中获取文本之前,应测试返回的对象是否有效。如果要CSV输出,可以使用csv.writer()

from bs4 import BeautifulSoup
import requests
import csv


# Read one URL per row from aliveSlice.csv, scrape the restaurant name,
# address and phone number from each page, and write them to output.csv.
with open('aliveSlice.csv', newline='') as f_input, open('output.csv', 'w', newline='', encoding='utf-8') as f_output:
    csv_input = csv.reader(f_input)
    csv_output = csv.writer(f_output)
    csv_output.writerow(["url", "name", "address", "phone"])

    for row in csv_input:
        # csv.reader yields an empty list for blank lines; indexing row[0]
        # on one would raise IndexError and abort the run.
        if not row or not row[0].strip():
            continue
        url = row[0].strip()

        # A timeout plus explicit error handling lets one unreachable or
        # failing URL be reported and skipped instead of hanging or crashing.
        try:
            req = requests.get(url, timeout=10)
            req.raise_for_status()
        except requests.RequestException as err:
            print(f"Request failed - {url}: {err}")
            continue

        content = BeautifulSoup(req.content, "html.parser")

        for pizzeria in content.find_all('div', attrs={"class": "f19xeu2d"}):
            name = pizzeria.find('h1', attrs={"class": "f13p7rsj"})
            address = pizzeria.find('address', attrs={"class": "f1lfckhr"})
            phone = pizzeria.find('button', attrs={"class": "f12gt8lx"})

            # find() returns None when an element is absent, so check all
            # three before reading .text.
            if name and address and phone:
                csv_output.writerow([url, name.text, address.text, phone.text])
            else:
                print(f"Missing data - {url}")

为您提供以下类型的CSV输出:

url,name,address,phone
https://slicelife.com/restaurants/wi/milwaukee/53211/pizza-man-milwaukee/menu,Pizza Man Milwaukee,"2597 N Downer Ave, Milwaukee, WI 53211",414-622-1034
https://slicelife.com/restaurants/nj/northvale/7647/three-boys-from-italy-northvale/menu,Three Boys From Italy,"238 Livingston St, Northvale, NJ 7647",201-879-0152
https://slicelife.com/restaurants/mn/mankato/56001/jake-s-stadium-pizza/menu,Jake's Stadium Pizza,"330 Stadium Rd, Mankato, MN 56001",507-225-7978
https://slicelife.com/restaurants/pa/new-brighton/15066/bakers-buck-hut/menu,Bakers Buck Hut,"1103 Route 68, New Brighton, PA 15066",724-521-4028

还请注意,如果您使用 split(':') 而文本中不包含 ":",返回的将是只含一项的列表,此时对它使用 [1] 会引发 IndexError。不过,您可以改用 [-1] 来取列表的最后一项。