我编写了一个抓取工具,用于从网站中提取价格,名称和评论。但是当我把他们聚在一起时,只有9个结果显示在30个中。不知道问题出在哪里。此外,我需要添加下一页链接,我会这样做。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
import unittest, time, re
import time
from scrapy.item import Item, Field
from selenium import webdriver
from scrapy.spider import BaseSpider
from scrapy.contrib.spiders import CrawlSpider, Rule
from bs4 import BeautifulSoup
import urllib2
import sys;
reload(sys);
sys.setdefaultencoding("utf8")
class Agoda(CrawlSpider):
name = 'agoda'
allowed_domains = ["agoda.com"]
start_urls = ["http://www.agoda.com"]
driver = webdriver.Firefox()
driver.get("http://www.agoda.com")
driver.find_element_by_id("ctl00_ctl00_MainContent_area_promo_HomeSearchBox1_TextSearch1_searchText").clear()
driver.find_element_by_id("ctl00_ctl00_MainContent_area_promo_HomeSearchBox1_TextSearch1_searchText").send_keys("Mumbai")
driver.find_element_by_xpath("//select[contains(@id,'ddlCheckInDay')]")
driver.find_element_by_xpath("//option[contains(.,'Mon 09')]").click()
driver.find_element_by_id("ctl00_ctl00_MainContent_area_promo_HomeSearchBox1_SearchButton").click()
driver.find_element_by_id("ctl00_ContentMain_rptAB1936_ctl01_rptSearchResultAB1936_ctl01_lnkResult1936" or "ctl00_ContentMain_rptSearchResult_ctl01_lnkResult" or "ctl00_ContentMain_rptSearchResult_ctl01_lnkResult").click()
#driver.find_element_by_id("ctl00_ContentMain_rptSearchResult_ctl01_lnkResult").click()
time.sleep(40);
#print driver.page_source
TotalResults = driver.find_element_by_xpath("//span[@class='blue ssr_search_text']")
print TotalResults.text
html_source = driver.page_source
soup = BeautifulSoup(html_source)
names = soup("a", {"class":"hot_name"})
prices = soup("span", {"class":"fontxlargeb purple"})
reviews = soup("a", {"class":"fontlargeb"})
hotel_names = [name[1].get_text() for name in enumerate(names)] #or [name[1].get_text() for name in enumerate(names)]
prices = [price[1].get_text() for price in enumerate(prices)]
reviews = [review[1].get_text() for review in enumerate(reviews)] #[price[1].get_text() for price in enumerate(prices)]
name_price_list = zip(hotel_names, prices, reviews)
for name, price, review in name_price_list:
print name, price, review
答案 0 :(得分:3)
您可以使用itertools中的izip_longest
创建一个聚合来自每个迭代的元素的迭代器。 如果迭代的长度不均匀,则填充缺失值 有fillvalue。迭代继续,直到最长的可迭代为止 用尽
示例:
>>> import itertools
>>> l2 = ['a','b','c']
>>> l1 = [1, 2]
>>> list(itertools.izip_longest(l1, l2))
[(1, 'a'), (2, 'b'), (None, 'c')]