尝试抓取“显示更多结果”时出现重复结果错误——Scrapy 中的 href

时间:2015-04-15 16:14:22

标签: python scrapy screen-scraping duplicates

我正在尝试抓取一个显示更多结果选项的网页。像这样设置restrict xpath ...

(restrict_xpaths='//a[@href="#"]')

但是,由于结果显示在同一页面上,scrapy希望再次抓取所有结果并收到“重复结果”并在运行爬虫时崩溃。我相信阅读后可能与dont_filter选项有关吗?

以下是该网站的链接......

http://www.skiddle.com/whats-on/London/?sort=18&from_date=15+Apr+2015&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&view=gallery

谢谢你们

我的蜘蛛

import scrapy # Import required libraries.
from scrapy.selector import HtmlXPathSelector # Allows for path detection in a websites code.
from scrapy.spider import BaseSpider # Used to create a simple spider to extract data.
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor # Needed for the extraction of href links in HTML to crawl further pages.
from scrapy.contrib.spiders import CrawlSpider # Needed to make the crawl spider.
from scrapy.contrib.spiders import Rule # Allows specified rules to affect what the link 
from urlparse import urlparse
import soundcloud
import mysql.connector
import requests
import time
from datetime import datetime
from scrapy.http import FormRequest

from tutorial.items import TutorialItem

class Skiddle_ClubSpider(CrawlSpider):
    """Crawl Skiddle's London listings and yield one TutorialItem per event card.

    Run from the project folder with: ``scrapy crawl Skiddle_Club``.
    """
    name = "Skiddle_Club"  # Spider name used on the command line.
    allowed_domains = ["www.skiddle.com"]  # Domain strings, NOT full URLs.
    start_urls = [
        "http://www.skiddle.com/whats-on/London/?sort=18&from_date=15+Apr+2015&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&eventcodes%5B%5D=4&view=gallery"
    ]

    # Follow the "show more results" anchors (href="#") found on each page
    # and hand every followed page to parse_item1.
    rules = [
        Rule(SgmlLinkExtractor(restrict_xpaths='//a[@href="#"]'),
             callback="parse_item1",
             follow=True),
    ]

    def parse_start_url(self, response):
        """Parse the landing page itself.

        CrawlSpider only applies `rules` to the start URLs and does not run a
        callback on them, so this hook forwards the first response to
        parse_item1. See
        http://stackoverflow.com/questions/15836062/scrapy-crawlspider-doesnt-crawl-the-first-landing-page
        """
        return self.parse_item1(response)

    def parse_item1(self, response):
        """Extract one item per event card ('moveable' div) on the page."""
        items = []
        for info in response.xpath('//div[@class="moveable"]'):
            summary = info.xpath('.//div[@class="summary"]//text()').extract()
            # Bug fix: the original indexed summary[1] unconditionally, which
            # raised IndexError on cards without a second summary text node.
            if len(summary) < 2:
                continue  # skip malformed/empty event cards
            item = TutorialItem()
            item['table'] = "London"
            item['artist'] = summary[1]  # second text node holds the artist name
            items.append(item)

        return items

1 个答案:

答案 0 :(得分:1)

我认为这里不需要 CrawlSpider，用普通的 Spider 就能完成这项任务，请查看下面的代码

from scrapy.spider import Spider # Used to create a simple spider to extract data.
from datetime import datetime
from scrapy.http import  Request
from skiddle.items import *
import re

class Skiddle_ClubSpider(Spider):
    """Scrape artist names from Skiddle's London listings.

    The site's "show more results" button fires an AJAX request, so instead
    of following href="#" anchors with a CrawlSpider, this spider calls the
    AJAX pagination URL directly and advances the ``o`` offset parameter by
    one page of results at a time.
    """

    # Skiddle's AJAX endpoint returns this many results per page; used to
    # advance the `o` offset between requests.
    RESULTS_PER_PAGE = 24

    name = "Skiddle_Club"
    allowed_domains = ["www.skiddle.com"]
    start_urls = [
        "http://www.skiddle.com/whats-on/London/?sort=18&from_date=15+Apr+2015&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&eventcodes%5B%5D=4&view=gallery"
    ] 

    def __init__(self, name=None, **kwargs):
        # Bug fix: the original never invoked the base initializer, so
        # Spider's name/kwargs handling was skipped entirely.
        super(Skiddle_ClubSpider, self).__init__(name, **kwargs)
        self.today = datetime.today().strftime('%d+%b+%Y')
        # Rebuild the start URL so from_date always reflects the crawl date.
        self.start_urls = ["http://www.skiddle.com/whats-on/London/?sort=18&from_date={}&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&eventcodes%5B%5D=4&view=gallery".format(self.today)]

    def parse(self, response):
        """Re-issue the first page with the AJAX pagination URL stashed in meta."""
        _next_page_url = 'http://www.skiddle.com/whats-on/London/?ajaxing=1&sort=18&from_date={}&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&view=gallery&o=0&bannertitle={}'.format(self.today, self.today)
        yield Request(url=response.url, meta={'next': _next_page_url}, callback=self.parse_artist)

    def parse_artist(self, response):
        """Yield one item per artist, then request the next AJAX page.

        Stops when a page contains no artists (end of the result set).
        """
        artists = response.xpath('//div[@class="summary"]/a/text()').extract()
        if not artists:
            return

        for artist in artists:
            # NOTE(review): these replace literal backslash-escape sequences
            # ('\\t' etc.), which only appear in the AJAX (JSON-escaped)
            # responses — confirm this matches the endpoint's payload format.
            yield TutorialItem(
                table="London",
                artist=artist.replace('\\t', '').replace('\\r', '').replace('\\n', '')
            )

        # Pull the current offset out of the URL we were given and advance it
        # by one page of results.  Raw string fixes the non-raw regex literal.
        page_index = re.findall(r'&o=(\d+)&bannertitle', response.meta['next'])
        if page_index:
            next_offset = int(page_index[0]) + self.RESULTS_PER_PAGE
            next_page_url = 'http://www.skiddle.com/whats-on/London/?ajaxing=1&sort=18&from_date={}&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&view=gallery&o={}&bannertitle={}'.format(self.today, str(next_offset), self.today) 
            yield Request(url=next_page_url, callback=self.parse_artist, meta={'next': next_page_url})  

使用ajax请求进行分页,您可以在 next_page_url 中查看ajax网址,并根据抓取日期更新start-url。