Flipkart.com 的第一页只显示 15 到 20 个结果,向下滚动页面才会加载更多结果。Scrapy 能成功提取第 1 页的结果,但提取不到后续加载的结果。我尝试过使用 Selenium,但没有成功。这是我的代码:
from scrapy.spider import Spider
from scrapy.selector import Selector
from flipkart.items import FlipkartItem
from scrapy.spider import BaseSpider
from selenium import webdriver
class FlipkartSpider(BaseSpider):
    """Spider for a Flipkart listing page.

    Opens the page in a Selenium-driven Firefox, but extracts the items from
    the response downloaded by Scrapy (see the comments in ``parse``).
    """
    name = "flip1"
    allowed_domains = ["flipkart.com"]
    start_urls = [
        "http://www.flipkart.com/beauty-and-personal-care/personal-care-appliances/hair-dryers/pr?sid=t06,79s,mh8&otracker=nmenu_sub_electronics_0_Hair%20Dryers"
    ]

    def __init__(self):
        # One Firefox instance is created per spider instance.
        # The base-class initializer is not called here.
        self.driver = webdriver.Firefox()

    def parse(self, response):
        """Yield one FlipkartItem (title, price) per product node found in `response`."""
        # Selector over the HTML that Scrapy downloaded -- not over the page
        # rendered in the browser.
        sel = Selector(response)
        # Open the same URL in the Selenium-driven browser.
        self.driver.get(response.url)
        while True:
            # Look up the "show more results" element in the browser.
            # The local name `next` shadows the builtin of the same name.
            next = self.driver.find_element_by_xpath('//div[@id="show-more-results"]')
            try:
                # Product nodes are read from `sel`, i.e. from the original
                # response, so every pass of this loop sees the same nodes.
                sites = sel.select('//div[@class="gd-col gu12 browse-product fk-inf-scroll-item"] | //div[@class="pu-details lastUnit"]')
                for site in sites:
                    item = FlipkartItem()
                    item['title'] = site.select('div//a[@class="lu-title"]/text() | div[1]/a/text()').extract()
                    item['price'] = site.select('div//div[@class="pu-price"]/div/text() | div//div[@class="pu-final"]/span/text()').extract()
                    yield item
                # NOTE(review): presumably a WebElement has no
                # wait_for_page_to_load() (it looks like a Selenium RC call).
                # If so, this raises, the bare except below ends the loop after
                # the first pass, and only the first batch is yielded -- confirm.
                next.wait_for_page_to_load("30")
            except:
                # Bare except: any error raised in the try body silently ends
                # the loop.
                break
        # Close the browser window once the loop has ended.
        self.driver.close()
我的 items.py 是:
import scrapy
class FlipkartItem(scrapy.Item):
    """Product record scraped from a Flipkart listing page.

    Every key a spider assigns must be declared here as a ``scrapy.Field``;
    assigning an undeclared key raises ``KeyError``.
    """

    title = scrapy.Field()
    price = scrapy.Field()
    # Declared because the listing spider also assigns these keys.
    rating = scrapy.Field()
    image = scrapy.Field()
    link = scrapy.Field()
下面是输出,我只得到了 15 项:
[{"price": ["Rs. 599"], "title": ["\n Citron Elegant 1400 W HD001 Hair Dryer (Pink)\n "]},
{"price": ["Rs. 799"], "title": ["\n Citron Vogue 1800 W HD002 Hair Dryer (White)\n "]},
{"price": ["Rs. 645"], "title": ["\n Philips HP8100/00 Hair Dryer (Blue)\n "]},
{"price": ["Rs. 944"], "title": ["\n Philips HP8111/00 Hair Dryer\n "]},
{"price": ["Rs. 171"], "title": ["\n Nova Professional With 2 Speed NV-1290 Hair Dryer (Pink...\n "]},
{"price": ["Rs. 175"], "title": ["\n Nova NHD 2840 Hair Dryer\n "]},
{"price": ["Rs. 775"], "title": ["\n Philips HP 8112 Hair Dryer\n "]},
{"price": ["Rs. 1,925"], "title": ["\n Philips HP8643/00 Miss Fresher's Pack Hair Straightener...\n "]},
{"price": ["Rs. 144"], "title": ["\n Nova Foldable N-658 Hair Dryer (White, Pink)\n "]},
{"price": ["Rs. 1,055"], "title": ["\n Philips HP8100/46 Hair Dryer\n "]},
{"price": ["Rs. 849"], "title": ["\n Panasonic EH-ND12-P62B Hair Dryer (Pink)\n "]},
{"price": ["Rs. 760"], "title": ["\n Panasonic EH-ND11 Hair Dryer (White)\n "]},
{"price": ["Rs. 1,049"], "title": ["\n Panasonic EH-ND13-V Hair Dryer (Violet)\n "]},
{"price": ["Rs. 1,554"], "title": ["\n Philips 1600 W HP4940 Hair Dryer (White & Light Pink)\n "]},
{"price": ["Rs. 2,008"], "title": ["\n Philips Kerashine HP8216/00 Hair Dryer\n "]}]
答案 0 :(得分:1)
你必须强制webdriver加载更多结果。 为了能够与其他结果进行交互,webdriver需要滚动页面直到元素出现。
滚动代码为:
# Scroll vertically to the target element's y offset. The value is passed as a
# script argument (arguments[0]) because Python names such as `location` are
# not visible inside the JavaScript string.
driver.execute_script("window.scrollTo(0, arguments[0]);", location['y'])
要确定滚动到的位置,您可以在页面下半部分找一个元素(例如页脚),然后滚动到该元素。要获取元素的坐标,可以使用 WebElement 的 location 属性:
# Start a Firefox session controlled by Selenium.
driver = webdriver.Firefox()
# Locate an element low on the page (e.g. the footer); "//someXpath" is a
# placeholder to be replaced with a real XPath.
down = driver.find_element_by_xpath("//someXpath")
# Coordinates of that element; its 'y' entry is what the scroll call uses.
location = down.location
答案 1 :(得分:1)
您可以使用Javascript向下滚动页面。
以下代码会将页面在 x 和 y 方向上各滚动 10000 像素。由于 10000 是一个足够大的数值,它会把你带到页面底部。一旦到达底部,Flipkart 就会触发 AJAX 请求来加载更多商品。
// Scroll the window by 10000 px horizontally and 10000 px vertically.
window.scrollBy(10000,10000);
我不确定如何在 Scrapy 中做到这一点,但使用 Selenium 很容易。
代码如下:
// Cast the driver to JavascriptExecutor and run the scroll script in the browser.
((JavascriptExecutor) driver).executeScript("window.scrollBy(10000,10000);");
答案 2 :(得分:1)
我用另一种方式解决了这个问题,请参考下面的代码。它适用于整个网站。
class FlipkartSpider(BaseSpider):
    """Scrape a Flipkart listing page whose results are loaded lazily.

    The raw response Scrapy downloads only contains the first batch of
    products. The page is therefore opened in a Selenium-driven Firefox,
    scrolled (clicking "show more results" when it is displayed) until no
    further results can be loaded, and the rendered HTML is then parsed with
    Scrapy selectors.

    ``FlipkartItem`` must declare the ``title``, ``price``, ``rating``,
    ``image`` and ``link`` fields.
    """

    name = "flip1"
    allowed_domains = ["flipkart.com"]
    start_urls = [
        "http://www.flipkart.com/tablets/pr?sid=tyy%2Chry&q=mobile&ref=b8b64676-065a-445c-a6a1-bc964d5ff938"
    ]

    # Seconds to wait after each scroll/click so the AJAX request can finish.
    scroll_pause = 5
    # Page-load timeout (seconds) handed to the webdriver.
    page_load_timeout = 10000

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and start the Firefox webdriver.

        Positional and keyword arguments are forwarded to the base class so
        spider arguments supplied by Scrapy keep working.
        """
        super(FlipkartSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def _load_all_results(self):
        """Scroll and click "show more" until no more results can be loaded."""
        # Function-scope imports keep this snippet self-contained.
        import time
        from selenium.common.exceptions import NoSuchElementException

        driver = self.driver
        # Loop-invariant setting: configure it once instead of on every pass.
        driver.set_page_load_timeout(self.page_load_timeout)
        while True:
            # Jump to the bottom of the page to trigger the lazy loading.
            driver.execute_script("window.scrollTo(10000000,10000000)")
            try:
                show_more = driver.find_element_by_xpath('//div[@id="show-more-results"]')
                if show_more.value_of_css_property('display') == "block":
                    show_more.click()
                no_more = driver.find_element_by_xpath(
                    '//*[@id="no-more-results" and @class="dont-show"]'
                )
                if no_more.value_of_css_property('display') == "block":
                    break
            except NoSuchElementException:
                # One of the pagination markers is missing from the page:
                # there is nothing further to load.
                self.log("Pagination markers not found; stopping the scroll loop")
                break
            time.sleep(self.scroll_pause)

    def parse(self, response):
        """Yield one ``FlipkartItem`` per product on the fully loaded page.

        :param response: the Scrapy response for a start URL; only its URL is
            used, the HTML is taken from the browser after scrolling.
        """
        from scrapy.http import TextResponse

        try:
            self.driver.get(response.url)
            self._load_all_results()
            # Re-wrap the browser-rendered HTML so Scrapy selectors can be used.
            rendered = TextResponse(
                url=response.url,
                body=self.driver.page_source,
                encoding='utf-8',
            )
            sites = rendered.xpath('//div[@class="gd-col gu12 browse-product fk-inf-scroll-item"] | //div[@class="pu-details lastUnit"] | //div[@class="pu-visual-section"]')
            for site in sites:
                item = FlipkartItem()
                item['title'] = site.xpath('div//a[@class="lu-title"]/text() | div[1]/a/text()').extract()
                item['price'] = site.xpath('div//div[@class="pu-price"]/div/text() | div//div[@class="pu-final"]/span/text()').extract()
                item['rating'] = site.xpath('div[@class="pu-rating"]/div/@title').extract()
                item['image'] = site.xpath('a/img/@src').extract()
                item['link'] = site.xpath('a/@href').extract()
                yield item
        finally:
            # Always release the browser, even if loading or extraction fails.
            # Errors are no longer swallowed; they propagate to Scrapy.
            # NOTE: the browser is shut down after the first parsed page, which
            # matches the single start URL above.
            self.driver.quit()