用 Python 抓取 TripAdvisor 上餐馆的 URL 和电子邮件

时间:2017-05-28 16:29:07

标签: python web-scraping

我正在尝试编写一个 TripAdvisor 爬虫(scraper)来提高自己的 Python 技能。目前,这个爬虫可以抓取某个城市中的餐馆,并将餐馆名称和对应的 TripAdvisor URL 保存到 Excel 文件中。不过,我还想把餐馆的电子邮件地址和餐馆自己的网站 URL 也保存下来,希望能得到一些建议。

有人能就此提供一些建议吗?谢谢!

import requests

from tkinter import *
from bs4 import BeautifulSoup as b
from bs4 import Comment as com
from openpyxl import Workbook
# city_name = 'London_England'
# geo_code = '186338'


def o_and_t():
    """START-button callback: capture the form values and close the window.

    Reads the three entry widgets held in the module globals ``e_1``,
    ``e_2`` and ``e_3`` and publishes their contents as the module globals
    ``nameFile`` (with ``.xlsx`` appended), ``geo_code`` and ``city_name``.
    Finally destroys the ``root`` window.

    Returns:
        None.
    """
    global nameFile, geo_code, city_name
    file_stem = e_1.get()
    nameFile = file_stem + '.xlsx'
    geo_code, city_name = e_2.get(), e_3.get()
    # The values are already copied out, so the widgets can go away now.
    root.destroy()

def gui():
    """Show the details form and block until the window is closed.

    Builds a 500x230 black Tk window containing a title banner, three
    prompts with an entry field each (output file name, code from the URL,
    city and country as in the URL) and a START button wired to
    ``o_and_t``. The window and the three entries are stored in the module
    globals ``root``, ``e_1``, ``e_2`` and ``e_3`` so that ``o_and_t`` can
    read them.

    Returns:
        None, once ``root.mainloop()`` has returned.
    """
    global root, e_1, e_2, e_3
    root = Tk()
    root.geometry('500x230')
    root.configure(bg='black')
    root.title('Enter Details')

    # The family used to be spelled "Helevetica", which is not a real font
    # family, so the requested face was never actually selected.
    font_family = "Helvetica"

    banner = Label(
        root,
        text='\t\tTripAdvisorScraper\n \t\t ~by a1b2t',
        font=(font_family, 14),
        bg='black',
        fg='white',
    )
    banner.place(x=0, y=0)

    # (prompt text, font size, y position) for each row of the form.
    rows = (
        ('Please Enter the FileName : ', 11, 60),
        ('Please enter the code from the url : ', 11, 90),
        ('Please enter the city and country as in url :', 12, 120),
    )

    # Create all labels first and the entries afterwards, keeping the
    # original widget creation order.
    for prompt, size, y_pos in rows:
        label = Label(root, text=prompt, font=(font_family, size), bg='black', fg='white')
        label.place(x=0, y=y_pos)

    entries = []
    for _prompt, _size, y_pos in rows:
        entry = Entry(root)
        entry.place(x=320, y=y_pos)
        entries.append(entry)
    e_1, e_2, e_3 = entries

    start_button = Button(root, text='START', command=o_and_t)
    start_button.place(x=220, y=170)
    root.mainloop()


gui()

# o_and_t() only defines these globals when START is pressed. If the window
# was closed some other way there is nothing to scrape, so stop with a clear
# message instead of failing further down with a NameError.
if any(name not in globals() for name in ('nameFile', 'geo_code', 'city_name')):
    raise SystemExit('No details were entered; nothing to scrape.')

print('\n\n\tStarting Scraper\t\n\n')

main_url = 'https://www.tripadvisor.co.uk/Restaurants-g{}-{}.html'.format(geo_code, city_name)

# Example paginated search URLs; the number after "o=a" is what the loop
# below steps in increments of 30:
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a150&availSearchEnabled=false
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a60&availSearchEnabled=false
#   https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo=186338&ajax=1&itags=10591&sortOrder=relevance&o=a30&availSearchEnabled=false

req_1 = requests.get(main_url)
soup = b(req_1.content, 'html.parser')

# The last pagination link carries the highest page number. When the landing
# page has no pagination links at all, fall back to a single page rather than
# raising IndexError on the empty list.
# NOTE(review): the "+ 1" looks like it requests one offset past the last
# page -- confirm against the live markup before changing it.
page_links = soup.find_all('a', class_="pageNum taLnk")
if page_links:
    total_pages = int(page_links[-1]['data-page-number']) + 1
else:
    total_pages = 1
print(total_pages)
RESULTS = []

for page_no in range(0, total_pages * 30, 30):
    url = 'https://www.tripadvisor.co.uk/RestaurantSearch?Action=PAGE&geo={}&ajax=1&itags=10591&sortOrder=relevance&o=a{}&availSearchEnabled=false'.format(geo_code, page_no)
    req_2 = requests.get(url)
    soup_2 = b(req_2.content, 'html.parser')
    for t in soup_2.find_all('a', class_="property_title"):
        # Strip the newlines and tabs that surround the link text.
        r_name = t.text.replace('\n', '').replace('\t', '')
        r_url = 'https://www.tripadvisor.com' + t['href']
        # Printed as encoded bytes; presumably to avoid console encoding
        # errors on non-ASCII names -- TODO confirm.
        print(str([r_name, r_url]).encode())
        RESULTS.append([r_name, r_url])


# Only create the workbook when something was scraped; one row per restaurant.
if RESULTS:
    wb = Workbook(write_only=True)
    ws = wb.create_sheet()
    for steps_0 in RESULTS:
        ws.append(steps_0)
    wb.save(nameFile)
print(len(RESULTS))
	
	

1 个答案:

答案 0 :(得分:1)

你应该逐个请求已经抓取到的每个餐馆网址,并在返回的页面中查找 class="detail_section info" 的元素。