因此,我正在尝试抓取一个网站,并且希望抓取许多表格。问题是,当我使用这两个for循环时,它将按年和按月抓取,但是它将混合来自不同月份和年份的数据,而不是按照循环所定义的顺序给出表。知道如何解决这个问题吗?
import scrapy
from ..items import RenItem
from scrapy.utils.response import open_in_browser
from scrapy.http import FormRequest
class ScrapeTableSpider(scrapy.Spider):
    """Scrape the monthly statistics tables from REN's "Estatistica Mensal" page.

    The page is an ASP.NET form: one POST selects the year, a second POST
    selects the month and submits "Executar", and the response contains a
    grid of ``tr.grid_row`` rows that are mapped onto ``RenItem`` fields.

    NOTE(review): Scrapy schedules requests concurrently, so responses (and
    therefore items) arrive in no guaranteed order.  Each request below is
    tagged with its year/month via ``meta`` so every yielded row remains
    attributable and can be sorted afterwards.  If strictly sequential
    scraping is required, chain the requests one-by-one or set
    ``CONCURRENT_REQUESTS = 1`` in the spider's ``custom_settings``.
    """

    name = 'scrape-table'
    # BUG FIX: allowed_domains must hold bare domain names, not full URLs —
    # a URL here makes OffsiteMiddleware drop every request.
    allowed_domains = ['www.centrodeinformacao.ren.pt']
    start_urls = ['https://www.centrodeinformacao.ren.pt/PT/InformacaoExploracao/Paginas/EstatisticaMensal.aspx']

    # RenItem field -> 1-based <td> column of each grid row.
    _FIELDS = (
        ('renmensal', 1),
        ('mes1', 2),
        ('acum1', 3),
        ('mes2', 4),
        ('acum2', 5),
        ('mes_variacao', 6),
        ('acum_variacao', 7),
    )

    def parse(self, response):
        """Issue one form POST per year (2007-2008), selecting the year dropdown."""
        for year in range(2007, 2009):
            yield FormRequest.from_response(
                response,
                formdata={
                    'ctl00$m$g_9b99ffea_e036_46c7_9be7_88c49a7820ac$ddlAnos': str(year)},
                callback=self.parse2,
                dont_filter=True,
                # Tag the request so downstream callbacks know which year
                # this response belongs to.
                meta={'year': year},
            )

    def parse2(self, response):
        """For the selected year, POST once per month (1-3) and submit the form."""
        year = response.meta.get('year')
        for month in range(1, 4):
            yield FormRequest.from_response(
                response,
                formdata={
                    'ctl00$m$g_9b99ffea_e036_46c7_9be7_88c49a7820ac$ddlMeses': str(month),
                    'ctl00$m$g_9b99ffea_e036_46c7_9be7_88c49a7820ac$cmdCxecutar': 'Executar'},
                callback=self.start_scraping,
                dont_filter=True,
                # Propagate the year and add the month so each result table
                # is unambiguously identified even when responses interleave.
                meta={'year': year, 'month': month},
            )

    def start_scraping(self, response):
        """Extract one RenItem per grid row of the result table.

        Empty cells are stored as a single space, matching the original
        behaviour (pipelines apparently expect a non-empty value).
        """
        # Removed debug-only open_in_browser(response) — it popped a browser
        # window for every single response.
        for row in response.xpath('//tr[@class="grid_row"]'):
            # BUG FIX: build a fresh item per row.  The original created one
            # RenItem before the loop and yielded the same mutable instance
            # repeatedly, so pipelines holding references saw it overwritten.
            item = RenItem()
            for field, col in self._FIELDS:
                values = row.xpath('td[%d]//text()' % col).extract()
                item[field] = values if values else ' '
            yield item