如何使用Python存储来自aspx网站的多个页面

时间:2018-06-19 14:04:48

标签: python asp.net web-scraping

我在使用这段代码时遇到了问题。目标是一个 ASPX 网站,我用循环依次请求搜索结果的每一页。代码运行时没有报错,也确实遍历了每一页,但数据集中最终只保存了一页(第一页)的数据。

有人可以帮帮我吗?我的编程水平还很初级,所以很可能是犯了一个简单的错误……

import scraperwiki
import requests
import mechanize
import re
from lxml import html


# Mirrors the most recently saved row; kept for backward compatibility with
# code that inspects this module-level name.
dataset = {}

# Column names for the first eight cells of a result row, in page order.
_COLUMNS = (
    'Beneficiary',
    'Postcode',
    'Town',
    'Rural development',
    'Direct Aid',
    'Market Schemes',
    'Total',
    'Responsible paying agency',
)

# Number of rows saved so far across *all* get_table() calls.  "index" is the
# unique key handed to scraperwiki.sqlite.save(), so it must keep counting
# from one page to the next; restarting it per page makes every page
# overwrite the rows stored for the previous one.
_rows_saved = 0


def get_table(root):
    """Save every data row of the results table found under *root*.

    *root* is a parsed HTML element exposing ``cssselect``.  Each ``<tr>`` of
    ``table.resultstable`` that has at least eight ``<td>`` cells is stored
    through ``scraperwiki.sqlite.save`` with ``index`` as its unique key.
    The key continues from the previous call, so rows from successive pages
    accumulate instead of replacing each other.

    Rows with fewer than eight cells (e.g. header rows without ``<td>``) are
    skipped.  ``Details`` is the ``href`` of the first link in the row, or
    ``None`` when the row contains no link.
    """
    global _rows_saved

    for row in root.cssselect("table.resultstable tr"):
        cells = row.cssselect("td")
        if len(cells) < len(_COLUMNS):
            # Not a data row: nothing to store, and indexing would fail.
            continue

        _rows_saved += 1
        # Build a fresh record per row so no value can leak from an earlier row.
        record = {'index': _rows_saved}
        for column, cell in zip(_COLUMNS, cells):
            record[column] = cell.text_content()

        links = row.cssselect("a")
        record['Details'] = links[0].attrib['href'] if links else None

        dataset.clear()
        dataset.update(record)
        scraperwiki.sqlite.save(["index"], record)


# Request search-result pages 1 through 10, submit the search form found on
# each one, and pass the parsed response to get_table().
url1 = "http://cap-payments.defra.gov.uk/SearchResults.aspx?Page="
last_part = "&Sort="
number = range(1, 11)

for page in number:
    url = "".join([url1, str(page), last_part])
    print(url)

    # A new browser is created for every page request.
    browser = mechanize.Browser()
    browser.open(url)

    # Debug output: the names of all forms on the page.
    form_names = [form.name for form in browser.forms()]
    print(" ".join(["All forms:", str(form_names)]))

    browser.select_form(name="aspnetForm")
    print(browser.form)

    # Set the ddlFinancialYear control (presumably the financial-year
    # dropdown -- confirm against the page) to '2017' and submit the form.
    browser["ctl00$Center$ContentPlaceHolder1$SearchControls1$ddlFinancialYear"] = ['2017']
    page_html = browser.submit().read()

    get_table(html.fromstring(page_html))

0 个答案:

没有答案