Scrapy Double提取细胞

时间:2017-03-08 22:49:41

标签: python web-scraping scrapy

我有这个脚本工作正常我认为我想要但我遇到一个问题。

class MySpider(scrapy.Spider):
name = 'Offers'
start_urls = ['https://www.invia.cz/direct/community_login/ajax-login/?ac_email=zisi%40electratours.cz&ac_password=electra1999']


def start_requests(self):
    return [scrapy.FormRequest("https://www.invia.cz/direct/community_login/ajax-login/?ac_email=zisi%40electratours.cz&ac_password=electra1999",
                               formdata={'user': '', 'pass': ''},
                               callback=self.after_login)]


def after_login(self, response):

        return scrapy.Request(url="https://dovolena.invia.cz/direct/tour_search/ajax-next-boxes/?nl_country_id%5B%5D=28&nl_locality_id%5B%5D=19&d_start_from=08.03.2017&d_end_to=28.05.2017&nl_transportation_id%5B%5D=3&nl_ck_id%5B%5D=62&sort=nl_sell&page=1&getOptionsCount=true&base_url=https%3A%2F%2Fdovolena.invia.cz%2F",callback=self.logged_in)

def logged_in(self, response):
    data=json.loads(response.text)

    sel=Selector(text=data['boxes_html'])
    ag=Selector(text=data['boxes_html'])
    ko=sel.css('li.hotel-box').extract()  

    for x in range(0,len(ko)):
      sel=Selector(text=ko[x])

      name=sel.css('span.name::text').extract_first()

      data=sel.xpath('//li/@data-content-value').extract()

      IC=[json.loads(d)["nl_hotel_id"] for d in data]

      for x in range(0,len(data)):
       IC[x]=''.join(map(str, IC[x]))

      ICn=[j.strip() for j in IC]

      print (ICn[x])

      url1="https://dovolena.invia.cz/direct/tour_search/ajax-next-box-rows/"

      url2=ICn[x]

      url3="/?nl_country_id%5B0%5D=28&nl_locality_id%5B0%5D=19&d_start_from=08.03.2017&d_end_to=28.05.2017&nl_transportation_id%5B0%5D=3&nl_ck_id%5B0%5D=62&sort=nl_sell&page=2&boxPage=1"

      url_final=url1+url2+url3

      print(url_final)

      request=scrapy.Request(url=str(url_final),callback=self.hotel)

      request.meta["name"]=name

      yield request


    next_page = ag.css("a.next::attr(data-page)").extract_first()
    print("Page %s -1" %next_page)
    url = re.sub('page=\d+', 'page=' + next_page, response.url)
    yield scrapy.Request(url, self.logged_in)

def hotel(self,response):
  names = response.meta['name']
  dates=response.css("strong.date").extract()
  prices=response.css("strong.orange").extract()
  airport=response.css("p.info>span::attr(title)").extract()
  meal=response.css("span.blue::text").extract()
  minute=response.css("span.symptom::text").extract()




  yield{


   "Hotel":names,  
   "Dates":dates,
   "Airport":airport,
   "MealType":meal,
   "Minute":minute
  }

  if(response.css("a.next")):
    next_page = response.css("a.next::attr(rel)").extract_first()
    url = re.sub('boxPage=\d+', 'boxPage=' + next_page, response.url)
    yield scrapy.Request(url, self.hotel)
  else:
    return

当我爬行并将它们全部放入csv输出中时,我看到两个"刮掉"单个单元格中的数据,我似乎无法找到解决方法或解决方法的原因。

0 个答案:

没有答案