我有一个保存在 csv 文件中的网址列表(该文件可以放在本地计算机上,也可以托管在网上)。我需要从列表中的每个网页提取商家名称、地址和电话号码。我已经找到了所有对应的 CSS 类名(class name)。我想把提取到的数据按上述各列写入一个 csv 文件。
来自csv:
https://slicelife.com/restaurants/wi/milwaukee/53211/pizza-man-milwaukee/menu
https://slicelife.com/restaurants/nj/northvale/7647/three-boys-from-italy-northvale/menu
https://slicelife.com/restaurants/mn/mankato/56001/jake-s-stadium-pizza/menu
https://slicelife.com/restaurants/pa/new-brighton/15066/bakers-buck-hut/menu
这是我目前的代码:
from bs4 import BeautifulSoup
import requests
import json
import csv
from urllib.request import urlopen
import requests
# Read one URL per row from aliveSlice.csv, scrape the business details from
# each page and write one row per URL to scrapedBiz.csv.
#
# The input file has no header row (every line is a URL), so a plain
# csv.reader is used; csv.DictReader would swallow the first URL as a header.
# Both files are opened once, up front: reopening the output in 'w' mode for
# every URL would truncate it and keep only the last result.
with open('aliveSlice.csv', 'r', newline='') as csvUrls_list, \
        open('scrapedBiz.csv', 'w', newline='', encoding='utf-8') as new_file:
    csv_reader = csv.reader(csvUrls_list)

    fieldnames = ['url', 'Raw Data', 'Business Name', 'Address', 'Phone']
    csv_writer = csv.DictWriter(new_file, fieldnames=fieldnames, delimiter=',')
    csv_writer.writeheader()

    for row in csv_reader:
        # Skip blank lines instead of failing on row[0].
        if not row or not row[0].strip():
            continue
        url = row[0].strip()

        # Fetch the page once; a timeout keeps a dead host from hanging the
        # whole run, and a failed request only skips this URL.
        try:
            page = requests.get(url, timeout=10)
            page.raise_for_status()
        except requests.RequestException as error:
            print(f"Request failed - {url}: {error}")
            continue

        soup = BeautifulSoup(page.text, 'html.parser')

        # Container element that wraps the name, address and phone number.
        pizzaPage = soup.find(class_='f19xeu2d')
        if pizzaPage is None:
            print(f"Missing data - {url}")
            continue

        # Look the elements up by CSS class (class_=...); a bare string
        # argument would be interpreted as a tag name and match nothing.
        restaurantName = pizzaPage.find(class_='f13p7rsj')
        address = pizzaPage.find(class_='f1lfckhr')
        phoneNumber = pizzaPage.find(class_='f12gt8lx')

        # DictWriter needs a mapping keyed by the field names. Any element
        # that is absent from the page is written as an empty cell.
        pizzeriaObject = {
            'url': url,
            'Raw Data': pizzaPage.get_text(' ', strip=True),
            'Business Name': restaurantName.get_text(strip=True) if restaurantName else '',
            'Address': address.get_text(strip=True) if address else '',
            'Phone': phoneNumber.get_text(strip=True) if phoneNumber else '',
        }
        csv_writer.writerow(pizzeriaObject)
# # TrattoriArray = []
# # with open('aliveSlice.csv','r') as csvf: # Open file in read mode
# # urls = csv.reader(csvf)
# # for url in urls:
# # TrattoriArray.append(url) # Add each url to list contents
# # for url in TrattoriArray: # Parse through each url in the list.
# # page = urlopen(url[0]).read()
# # content = BeautifulSoup(page.content, "html.parser")
# # pizzaArray = []
# # for pizzeria in content.findAll('div', attrs={"class": "f19xeu2d"}):
# # pizzeriaObject = {
# # "pizzeriaName": pizzeria.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
# # "address": pizzeria.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
# # "phoneNumber": pizzeria.find('rc-c2d-number', attrs={"span": "rc-c2d-number"}).text.encode('utf-8'),
# # }
# # pizzaArray.append(pizzeriaObject)
# # with open('pizzeriaData.json', 'w') as outfile:
# # json.dump(pizzaArray, outfile)
# # # from bs4 import BeautifulSoup
# # # import requests
# # # import json
# # # import csv
# # # from urllib.request import urlopen
# # # import urllib2
# # # import re
# # # urls = csv.reader(open('aliveSlice.csv'))
# # # for url in urls:
# # # response = urllib2.urlopen(url[0])
# # # html = response.read()
# # # # print re.findall('f19xeu2d',html)
# # # content = BeautifulSoup(f19xeu2d.content, "html.parser")
# # # # url = 'https://slicelife.com/restaurants/fl/west-palm-bea/33406/albertos-pizzeria/menu'
# # # # response = requests.get(url, timeout=5)
# # # # TrattoriArray = []
# # # # with open('aliveSlice.csv','r') as csvf: # Open file in read mode
# # # # urls = csv.reader(csvf)
# # # # for url in urls:
# # # # TrattoriArray.append(url) # Add each url to list contents
# # # # for url in TrattoriArray: # Parse through each url in the list.
# # # # page = urlopen(url[0]).read()
# # # # content = BeautifulSoup(page.content, "html.parser")
# # # # pizzaArray = []
# # # # for pizzeria in content.findAll('div', attrs={"class": "f19xeu2d"}):
# # # # pizzaArray.append(pizzeriaObject)
# # # # with open('pizzeriaData.json', 'w') as outfile:
# # # # json.dump(pizzaArray, outfile)
# # # htmlContent = response.content
# # # soup = BeautifulSoup(htmlContent, features="html.parser")
# # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8'),
# # # pizzeriaName = soup.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
# # # address = soup.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
# # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8'),
# # # #print(soup.prettify())
# # # pizzeriaObject = {
# # # (
# # # pizzeriaName
# # # +phoneNumber
# # # +address
# # # )
# # # }
# # # print(pizzeriaObject)
# # # print(pizzeriaName)
# # # print(phoneNumber)
# # # print(address)
# # # # import requests
# # # # from bs4 import BeautifulSoup
# # # # import csv
# # # # with open('aliveSlice.csv', newline='') as f_urls, open('output.csv', 'w', newline='') as f_output:
# # # # csv_urls = csv.reader(f_urls)
# # # # csv_output = csv.writer(f_output)
# # # # csv_output.writerow(['locationRawData' , 'pizzeriaName' , 'address', 'Phone'])
# # # # csv_output.writerow(['Ngoname', 'CEO', 'City', 'Address', 'Phone', 'Mobile', 'E-mail'])
# # # # for line in csv_urls:
# # # # r = requests.get(line[0]).text
# # # # soup = BeautifulSoup(r, "html.parser")
# # # # locationRawData = soup.find('h1')
# # # # print('RAW :', locationRawData.text)
# # # f13p7rsj
# # # # pizzeriaName = soup.find('h1', class_='f13p7rsj')
# # # # print('pizzeriaName:', pizzeriaName[1].text)
# # # ###########
# # # ngoname = soup.find('h1')
# # # print('NGO Name :', ngoname.text)
# # # pizzeriaName = soup.find('h1', class_='').text
# # # ceo_name = ceo.split(':')
# # # print('CeoName:', ceo_name[1])
# # # city = soup.find_all('span')
# # # print('City :', city[5].text)
# # # addressBiz = soup.find_all('address')
# # # print('Address :', addressBiz[6].text)
# # # phoneNumber = soup.find_all('button')
# # # print('Phone :', phoneNumber[7].text)
# # # mobile = soup.find_all('span')
# # # print('Mobile :', mobile[8].text)
# # # email = soup.find_all('span')
# # # print('Email_id :', email[9].text)
# # # csv_output.writerow([ngoname.text, ceo_name[1], city[5].text, address[6].text, phone[7].text, mobile[8].text, email[9].text])
# # # locationRawData = soup.find('h1')
# # # print('RAW :', locationRawData.text)
# # # pizzeriaName = soup.find('h1', class_='f13p7rsj')
# # # # pizzeria_name = pizzeriaName.split(':')
# # # print('pizzeriaName:', pizzeriaName[0])
# # # address = soup.find('address', class_='f1lfckhr')
# # # print('Address :', address[1].text)
# # # phoneNumber = soup.find('button', class_='f12gt8lx')
# # # print('Phone :', phoneNumber[2].text)
# # # locationRawData = soup.find('div', class_='f19xeu2d')
# # # print('RAW :', locationRawData[3].text)
# # # #############
# # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8'),
# # # pizzeriaName = soup.find('h1', attrs={"class": "f13p7rsj"}).text.encode('utf-8'),
# # # address = soup.find('address', attrs={"class": "f1lfckhr"}).text.encode('utf-8'),
# # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8'),
# # # # address = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8')
# # # # print('Address :', address[2].text)
# # # # phoneNumber = soup.find('button', attrs={"class": "f12gt8lx"}).text.encode('utf-8')
# # # # print('Phone :', phoneNumber[3].text)
# # # # locationRawData = soup.find('div', attrs={"class": "f19xeu2d"}).text.encode('utf-8')
# # # # print('RAW :', locationRawData[4].text)
# # # # csv_output.writerow([locationRawData.text, pizzeria_name[1], address[2].text, phoneNumber[3].text, locationRawData[4].text])
答案 0 :(得分:0)
问题出在获取电话号码的那段代码上:在尝试从返回的对象中读取文本之前,应先检查该对象是否有效(即不是 None)。如果需要输出 CSV,可以使用 `csv.writer()`:
from bs4 import BeautifulSoup
import requests
import csv
# Read one URL per row from aliveSlice.csv, scrape name / address / phone from
# each page and write them to output.csv. URLs that cannot be fetched, or
# whose page lacks the expected elements, are reported on stdout and skipped.
with open('aliveSlice.csv', newline='') as f_input, \
        open('output.csv', 'w', newline='', encoding='utf-8') as f_output:
    csv_input = csv.reader(f_input)
    csv_output = csv.writer(f_output)
    csv_output.writerow(["url", "name", "address", "phone"])

    for row in csv_input:
        # A blank line yields an empty row; skip it rather than fail on row[0].
        if not row or not row[0].strip():
            continue
        url = row[0].strip()

        # A timeout keeps one unresponsive host from stalling the run, and a
        # failed request only skips the current URL.
        try:
            req = requests.get(url, timeout=10)
            req.raise_for_status()
        except requests.RequestException as error:
            print(f"Request failed - {url}: {error}")
            continue

        content = BeautifulSoup(req.content, "html.parser")
        pizzerias = content.find_all('div', attrs={"class": "f19xeu2d"})

        # Report pages that do not contain the container div at all, so they
        # are not dropped silently.
        if not pizzerias:
            print(f"Missing data - {url}")
            continue

        for pizzeria in pizzerias:
            name = pizzeria.find('h1', attrs={"class": "f13p7rsj"})
            address = pizzeria.find('address', attrs={"class": "f1lfckhr"})
            phone = pizzeria.find('button', attrs={"class": "f12gt8lx"})

            # find() returns None when an element is absent, so check all
            # three before reading .text.
            if name and address and phone:
                csv_output.writerow([url, name.text, address.text, phone.text])
            else:
                print(f"Missing data - {url}")
为您提供以下类型的CSV输出:
url,name,address,phone
https://slicelife.com/restaurants/wi/milwaukee/53211/pizza-man-milwaukee/menu,Pizza Man Milwaukee,"2597 N Downer Ave, Milwaukee, WI 53211",414-622-1034
https://slicelife.com/restaurants/nj/northvale/7647/three-boys-from-italy-northvale/menu,Three Boys From Italy,"238 Livingston St, Northvale, NJ 7647",201-879-0152
https://slicelife.com/restaurants/mn/mankato/56001/jake-s-stadium-pizza/menu,Jake's Stadium Pizza,"330 Stadium Rd, Mankato, MN 56001",507-225-7978
https://slicelife.com/restaurants/pa/new-brighton/15066/bakers-buck-hut/menu,Bakers Buck Hut,"1103 Route 68, New Brighton, PA 15066",724-521-4028
还请注意,如果您使用 `split(':')` 而文本中并不包含 `:`,那么返回的列表只会有一项,因此对它使用 `[1]` 会失败(抛出 IndexError)。不过,您可以改用 `[-1]` 来取得列表的最后一项。