我正在尝试编写一个 TripAdvisor 爬虫来提高我的 Python 技能。目前,爬虫可以抓取某个城市的餐厅,并把餐厅名称和 TripAdvisor URL 保存到 Excel 文件中。不过,我还想进一步保存餐厅的电子邮箱和官网直链。
有人能为此提供一些建议吗?谢谢!
import requests
from tkinter import *
from bs4 import BeautifulSoup as b
from bs4 import Comment as com
from openpyxl import Workbook
# city_name = 'London_England'
# geo_code = '186338'
def o_and_t():
    """Harvest the dialog's three entry fields into module-level globals,
    then tear the window down so the scraper can start.

    Side effects: sets nameFile (input + '.xlsx'), geo_code and city_name,
    and destroys the Tk root created by gui().
    """
    global nameFile, geo_code, city_name
    city_name = e_3.get()
    geo_code = e_2.get()
    # The first entry is the bare file name; append the Excel extension here.
    nameFile = '{}.xlsx'.format(e_1.get())
    root.destroy()
def gui():
    """Show the start-up dialog asking for file name, geo code and city.

    Creates the Tk root plus three Entry widgets as module globals
    (root, e_1, e_2, e_3) so the START callback o_and_t can read them.
    Blocks in mainloop() until o_and_t destroys the window.
    """
    global root, e_1, e_2, e_3
    root = Tk()
    root.geometry('500x230')
    root.configure(bg='black')
    root.title('Enter Details')

    # (label text, font size, y position) — all labels share style/placement.
    prompts = (
        ('\t\tTripAdvisorScraper\n \t\t ~by a1b2t', 14, 0),
        ('Please Enter the FileName : ', 11, 60),
        ('Please enter the code from the url : ', 11, 90),
        ('Please enter the city and country as in url :', 12, 120),
    )
    for text, size, y_pos in prompts:
        prompt = Label(root, text=text, font=("Helevetica", size),
                       bg='black', fg='white')
        prompt.place(x=0, y=y_pos)

    # One entry per prompt (skipping the banner), aligned at x=320.
    e_1 = Entry(root)
    e_1.place(x=320, y=60)
    e_2 = Entry(root)
    e_2.place(x=320, y=90)
    e_3 = Entry(root)
    e_3.place(x=320, y=120)

    start_button = Button(root, text='START', command=o_and_t)
    start_button.place(x=220, y=170)
    root.mainloop()
gui()

print('\n\n\tStarting Scraper\t\n\n')

# Scrape and link against the same host; the original mixed .co.uk pages
# with .com detail links.
BASE_URL = 'https://www.tripadvisor.co.uk'

# First results page for the chosen city — used only to discover how many
# result pages exist.
main_url = '{}/Restaurants-g{}-{}.html'.format(BASE_URL, geo_code, city_name)

# timeout keeps a stalled connection from hanging the script forever.
req_1 = requests.get(main_url, timeout=30)
soup = b(req_1.content, 'html.parser')

# Pagination links carry data-page-number; the last link is the page count.
# A city with a single page of results has no pagination links at all, so
# fall back to 1 instead of crashing on an empty list.
page_links = soup.find_all('a', class_="pageNum taLnk")
total_pages = int(page_links[-1]['data-page-number']) if page_links else 1
print(total_pages)

RESULTS = []
# TripAdvisor's AJAX search endpoint pages by result offset: o=a0, a30, a60...
# N pages means offsets 0 .. (N-1)*30, i.e. range(0, N*30, 30).
for offset in range(0, total_pages * 30, 30):
    url = ('{}/RestaurantSearch?Action=PAGE&geo={}&ajax=1&itags=10591'
           '&sortOrder=relevance&o=a{}&availSearchEnabled=false'
           ).format(BASE_URL, geo_code, offset)
    req_2 = requests.get(url, timeout=30)
    soup_2 = b(req_2.content, 'html.parser')
    for anchor in soup_2.find_all('a', class_="property_title"):
        r_name = anchor.text.replace('\n', '').replace('\t', '')
        # hrefs are site-relative; prefix the host the page was fetched from.
        r_url = BASE_URL + anchor['href']
        # encode() avoids console codec errors on non-ASCII restaurant names.
        print(str([r_name, r_url]).encode())
        RESULTS.append([r_name, r_url])

if RESULTS:
    # write_only mode streams rows instead of holding the sheet in memory.
    wb = Workbook(write_only=True)
    ws = wb.create_sheet()
    for row in RESULTS:
        ws.append(row)
    wb.save(nameFile)

print(len(RESULTS))
答案 0(得分:1):
你应该逐个访问已抓取到的每个餐厅网址,并在页面中查找 class 为 "detail_section info" 的元素。