每页抓取10行时,Python代码可以更改网址中的页码,但每页抓取100行时却不行

时间:2017-03-23 11:14:51

标签: python html python-2.7 url beautifulsoup

每页显示100行的网址为Link-1

每页显示10行的网址与之相同,只是显示数量(DISPLAY_COUNT)= 10,即Link-2。

当我使用 DISPLAY_COUNT = 10 进行抓取时,我的代码能够翻页,只是它从第11页才开始抓取。但是当我用同样的代码把显示数量改为100时,它抓到第100行就停止了。然而在下面的链接中,我只需把 PAGE 改为 1,浏览器就会显示第101-200行。有人知道为什么我的代码抓不到后面的行吗?完整代码如下。提前谢谢。
Link-3

#!/usr/bin/env python
# -*- coding: utf-8 -*- 

import requests
from bs4 import BeautifulSoup as soup
import csv


# Scrape the saa.gov.uk valuation-roll search for every street listed in the
# input file: walk the result pages of each street, open every property link
# found on a page, and write the property URL plus its reference to a CSV file.

outputfilename = 'ggw_LondonRoad10.csv'

inputfilename = 'ggw100_streets.txt'
# Alternative street list used previously: 'Edinburgh.txt'

base_url = 'https://www.saa.gov.uk'

# Rows requested per result page; used for EVERY search request of a street.
# NOTE(review): the earlier version probed with DISPLAY_COUNT=100 and then
# paged with DISPLAY_COUNT=10 inside the same session. Mixing the two may be
# why paging appeared to start late -- confirm against the site before
# raising this value to 100.
display_count = 10

# Single source of truth for the search URL. The street is inserted verbatim,
# so the input file is expected to hold names already in query-string form
# (e.g. "London+Road").
search_url_template = (
    base_url + '/search/?SEARCHED=1&SEARCH_TABLE=valuation_roll_cpsplit'
    '&SEARCH_TERM=glasgow%2C{street}%2C+GLASGOW%2C+Glasgow+City'
    '&PAGE={page}&DISPLAY_COUNT={count}&ASSESSOR_ID=&TYPE_FLAG=CPD'
    '&ORDER_BY=PROPERTY_ADDRESS&H_ORDER_BY=SET+DESC'
    '&ORIGINAL_SEARCH_TERM=glasgow'
    '&DRILL_SEARCH_TERM={street}%2C+GLASGOW%2C+Glasgow+City'
    '&DD_UNITARY_AUTHORITY=Glasgow+City&DD_TOWN=GLASGOW&DD_STREET={street}'
    '&DEBUG_LEVEL=0&SEARCH_METHOD=++&&PT=1#results'
)

# NOTE(review): the header declares 14 columns but each data row below only
# carries the URL and the reference, so the reference lands in the second
# column. Left unchanged here; confirm the intended layout.
header = ['URL', 'Local Council', 'Town', 'Locality', 'Street', 'Reference',
          'Description', 'Address', 'Proprietor', 'Tenant', 'Occupier',
          'Current Value', 'Pre-Revaluation 2010', 'Proposed Value 2017']

session = requests.session()

# The context manager guarantees the CSV is flushed and closed even when a
# request fails part-way through. 'wb' is the mode the Python 2 csv module
# expects for output files.
with open(outputfilename, 'wb') as outputfile:
    writer = csv.writer(outputfile)
    writer.writerow(header)

    # Read the street list up front so the input file is closed before any
    # network traffic starts.
    with open(inputfilename, 'rb') as inputfile:
        streets = inputfile.readlines()

    for a_street in streets:
        ggw100_streets = a_street.strip('\r\n')
        if not ggw100_streets:
            # A blank line would otherwise trigger a search with no street.
            continue

        page_num = 0
        while True:
            url = search_url_template.format(
                street=ggw100_streets, page=page_num, count=display_count)
            response = session.get(url)

            # The first page doubles as the "does this street exist" probe,
            # so page 0 is no longer fetched twice.
            if page_num == 0 and 'postcode incorrectly' in response.content:
                print('Street unknown: ' + ggw100_streets)
                break

            html = soup(response.text, 'lxml')
            prop_link = html.find_all("a", class_="pagelink button small")
            if not prop_link:
                # A page without property links ends this street.
                break

            for link in prop_link:
                prop_url = base_url + link["href"]

                prop_response = session.get(prop_url)
                prop = soup(prop_response.content, "lxml")

                LeftBlockData = prop.find_all(
                    "div", class_="columns small-7 medium-8 cell")
                if not LeftBlockData:
                    # Skip a detail page without the expected cell instead
                    # of aborting the whole run with an IndexError.
                    print('No reference found: ' + prop_url)
                    continue

                Reference = LeftBlockData[0].get_text().strip()
                print(Reference)

                writer.writerow([prop_url, Reference])

            page_num += 1

0 个答案:

没有答案