网址为Link-1。
每页显示10行的网址与之相同,只是显示计数(DISPLAY_COUNT)= 10,即Link-2。
当我使用 DISPLAY_COUNT = 10 进行抓取时,我的代码能够翻页,尽管它是从第11页开始抓取的。但是当我用同样的代码把显示计数改为100时,它抓到第100行就停止了。然而,对于下面的链接,我只需把 PAGE 改为 1,浏览器中就会显示第101-200行。有人知道为什么我的代码无法继续抓取吗?完整代码如下。提前谢谢。
Link-3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import sys

import requests
from bs4 import BeautifulSoup as soup
outputfilename = 'ggw_LondonRoad10.csv'
inputfilename = 'ggw100_streets.txt'
base_url = 'https://www.saa.gov.uk'

# Rows requested per results page. Used for every search request so the
# "does this street exist" probe and the paging loop cannot drift apart.
DISPLAY_COUNT = 10

# Header row written to the output CSV.
CSV_HEADER = [
    'URL', 'Local Council', 'Town', 'Locality', 'Street', 'Reference',
    'Description', 'Address', 'Proprietor', 'Tenant', 'Occupier',
    'Current Value', 'Pre-Revaluation 2010', 'Proposed Value 2017',
]

# Search URL with placeholders for the street, the zero-based page index and
# the page size. The street is inserted verbatim, so the input file must
# already hold URL-encoded names (e.g. "London+Road").
SEARCH_URL_TEMPLATE = (
    'https://www.saa.gov.uk/search/?SEARCHED=1'
    '&SEARCH_TABLE=valuation_roll_cpsplit'
    '&SEARCH_TERM=glasgow%2C{street}%2C+GLASGOW%2C+Glasgow+City'
    '&PAGE={page}'
    '&DISPLAY_COUNT={count}'
    '&ASSESSOR_ID='
    '&TYPE_FLAG=CPD'
    '&ORDER_BY=PROPERTY_ADDRESS'
    '&H_ORDER_BY=SET+DESC'
    '&ORIGINAL_SEARCH_TERM=glasgow'
    '&DRILL_SEARCH_TERM={street}%2C+GLASGOW%2C+Glasgow+City'
    '&DD_UNITARY_AUTHORITY=Glasgow+City'
    '&DD_TOWN=GLASGOW'
    '&DD_STREET={street}'
    '&DEBUG_LEVEL=0'
    '&SEARCH_METHOD=++'
    '&&PT=1#results'
)


def build_search_url(street, page_num, display_count=DISPLAY_COUNT):
    """Return the search URL for one results page of a street.

    Args:
        street: Street name, inserted into the query string as-is.
        page_num: Zero-based index of the results page.
        display_count: Number of rows requested per page.
    """
    return SEARCH_URL_TEMPLATE.format(
        street=street, page=page_num, count=display_count)


def open_csv_for_writing(path):
    """Open *path* in the mode the csv module expects on this interpreter."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode, newline translation left to the csv module.
        return open(path, 'w', newline='')
    # Python 2: the csv module writes byte strings.
    return open(path, 'wb')


def read_streets(path):
    """Return the non-blank lines of *path* with line endings removed."""
    with open(path, 'r') as inputfile:
        streets = [line.strip('\r\n') for line in inputfile]
    # A blank line would otherwise trigger a search with an empty street.
    return [street for street in streets if street.strip()]


def scrape_property(session, writer, prop_url):
    """Fetch one property page and write its URL and reference to the CSV.

    The reference is the text of the first
    ``div.columns.small-7.medium-8.cell`` element on the page. A page
    without such an element is reported and skipped instead of aborting the
    whole run with an IndexError.
    """
    response = session.get(prop_url)
    prop = soup(response.content, 'lxml')
    left_block_data = prop.find_all(
        'div', class_='columns small-7 medium-8 cell')
    if not left_block_data:
        print('No reference found: %s' % prop_url)
        return
    reference = left_block_data[0].get_text().strip()
    print(reference)
    writer.writerow([prop_url, reference])


def scrape_street(session, writer, street):
    """Walk every results page for *street* and scrape each listed property.

    Paging stops at the first results page that contains no property links.
    """
    probe = session.get(build_search_url(street, 0))
    if 'postcode incorrectly' in probe.text:
        print('Street unknown: %s' % street)
        return

    page_num = 0
    while True:
        response = session.get(build_search_url(street, page_num))
        html = soup(response.text, 'lxml')
        prop_links = html.find_all('a', class_='pagelink button small')
        if not prop_links:
            break
        for link in prop_links:
            scrape_property(session, writer, base_url + link['href'])
        # Advance exactly once per results page. Bumping the counter once
        # per property link would make PAGE jump by the page size and skip
        # (or run past) the remaining results.
        page_num += 1


def main():
    """Scrape every street in the input file into the output CSV."""
    session = requests.session()
    # Read the input first so a missing input file does not truncate an
    # existing output file.
    streets = read_streets(inputfilename)
    with open_csv_for_writing(outputfilename) as outputfile:
        writer = csv.writer(outputfile)
        writer.writerow(CSV_HEADER)
        for street in streets:
            scrape_street(session, writer, street)


if __name__ == '__main__':
    main()