I have this script working the way I think I want it to, but I've run into one problem.
import json
import re

import scrapy
from scrapy.selector import Selector


class MySpider(scrapy.Spider):
    name = 'Offers'
    # start_urls is never actually used, since start_requests() is overridden below
    start_urls = ['https://www.invia.cz/direct/community_login/ajax-login/?ac_email=zisi%40electratours.cz&ac_password=electra1999']

    def start_requests(self):
        # Log in first; the credentials already sit in the query string,
        # so the form body stays empty
        return [scrapy.FormRequest(
            "https://www.invia.cz/direct/community_login/ajax-login/?ac_email=zisi%40electratours.cz&ac_password=electra1999",
            formdata={'user': '', 'pass': ''},
            callback=self.after_login)]

    def after_login(self, response):
        # Fetch the first page of search results (a JSON payload)
        return scrapy.Request(
            url="https://dovolena.invia.cz/direct/tour_search/ajax-next-boxes/?nl_country_id%5B%5D=28&nl_locality_id%5B%5D=19&d_start_from=08.03.2017&d_end_to=28.05.2017&nl_transportation_id%5B%5D=3&nl_ck_id%5B%5D=62&sort=nl_sell&page=1&getOptionsCount=true&base_url=https%3A%2F%2Fdovolena.invia.cz%2F",
            callback=self.logged_in)

    def logged_in(self, response):
        # The HTML for the hotel boxes is embedded in the JSON response
        data = json.loads(response.text)
        ag = Selector(text=data['boxes_html'])
        boxes = ag.css('li.hotel-box').extract()
        for box in boxes:
            sel = Selector(text=box)
            name = sel.css('span.name::text').extract_first()
            contents = sel.xpath('//li/@data-content-value').extract()
            # Each data-content-value attribute is a JSON blob holding the hotel id
            for content in contents:
                hotel_id = ''.join(map(str, json.loads(content)['nl_hotel_id'])).strip()
                url_final = ("https://dovolena.invia.cz/direct/tour_search/ajax-next-box-rows/"
                             + hotel_id
                             + "/?nl_country_id%5B0%5D=28&nl_locality_id%5B0%5D=19&d_start_from=08.03.2017&d_end_to=28.05.2017&nl_transportation_id%5B0%5D=3&nl_ck_id%5B0%5D=62&sort=nl_sell&page=2&boxPage=1")
                request = scrapy.Request(url=url_final, callback=self.hotel)
                request.meta['name'] = name
                yield request
        # Paginate through the remaining result pages
        next_page = ag.css('a.next::attr(data-page)').extract_first()
        if next_page:
            url = re.sub(r'page=\d+', 'page=' + next_page, response.url)
            yield scrapy.Request(url, self.logged_in)

    def hotel(self, response):
        name = response.meta['name']
        dates = response.css('strong.date').extract()
        prices = response.css('strong.orange').extract()
        airport = response.css('p.info>span::attr(title)').extract()
        meal = response.css('span.blue::text').extract()
        minute = response.css('span.symptom::text').extract()
        yield {
            'Hotel': name,
            'Dates': dates,
            'Prices': prices,
            'Airport': airport,
            'MealType': meal,
            'Minute': minute,
        }
        # Paginate through the rows of this hotel's box
        if response.css('a.next'):
            next_page = response.css('a.next::attr(rel)').extract_first()
            url = re.sub(r'boxPage=\d+', 'boxPage=' + next_page, response.url)
            yield scrapy.Request(url, self.hotel)
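I export everything with Scrapy's built-in CSV feed export, along these lines (the output filename here is just an example):

scrapy crawl Offers -o offers.csv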
When I crawl and dump everything into CSV output, I can see the data from two "scrapes" landing in a single cell, and I can't seem to find the cause or a workaround.
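My guess is that this happens because fields like Dates and Prices are yielded as whole lists, so the CSV exporter writes every value for a page into one cell. Below is a minimal sketch of the direction I have tried, yielding one item per offer row instead; the zip() pairing assumes the extracted lists line up index by index, which I have not verified, and the ::text pseudo-element is my addition so the cells hold text rather than raw HTML:

def hotel(self, response):
    name = response.meta['name']
    dates = response.css('strong.date::text').extract()
    prices = response.css('strong.orange::text').extract()
    # One item per (date, price) pair, so each CSV cell gets a single value
    for date, price in zip(dates, prices):
        yield {'Hotel': name, 'Date': date, 'Price': price}

This gives one row per pair when I test it in isolation, but I am not sure it addresses the actual cause, or how to extend it to the other list-valued fields.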