I'm trying to scrape a web page that has a "show more results" option. I set restrict_xpaths like this ...
(restrict_xpaths='//a[@href="#"]')
However, since the extra results are shown on the same page, Scrapy tries to crawl all the results again, reports "duplicate results", and crashes when I run the spider. From what I've read, I believe this may be related to the dont_filter option?
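For context, here is my understanding of dont_filter as a minimal sketch (not my real spider — the spider name and the times_seen cap are made up for illustration): passing dont_filter=True on a Request is supposed to let an already-seen URL through Scrapy's duplicate filter.

import scrapy
from scrapy.http import Request

class MoreResultsSpider(scrapy.Spider):  # hypothetical demo spider, not the real one
    name = "more_results_demo"
    start_urls = ["http://www.skiddle.com/whats-on/London/"]

    def parse(self, response):
        times_seen = response.meta.get('times_seen', 0)
        if times_seen < 3:  # arbitrary cap so the demo doesn't loop forever
            # dont_filter=True lets this already-seen URL through the duplicate filter.
            yield Request(url=response.url,
                          callback=self.parse,
                          dont_filter=True,
                          meta={'times_seen': times_seen + 1})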
Here is the link to the site...
http://www.skiddle.com/whats-on/London/?sort=18&from_date=15+Apr+2015&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&view=gallery
Thanks everyone.
My spider:
import scrapy  # Import required libraries.
from scrapy.selector import HtmlXPathSelector  # Allows for path detection in a website's code.
from scrapy.spider import BaseSpider  # Used to create a simple spider to extract data.
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor  # Needed for the extraction of href links in HTML to crawl further pages.
from scrapy.contrib.spiders import CrawlSpider  # Needed to make the crawl spider.
from scrapy.contrib.spiders import Rule  # Allows specified rules to control which links are followed.
from urlparse import urlparse
import soundcloud
import mysql.connector
import requests
import time
from datetime import datetime
from scrapy.http import FormRequest
from tutorial.items import TutorialItem


class Skiddle_ClubSpider(CrawlSpider):
    name = "Skiddle_Club"  # Name of the spider. In a command prompt, from the project folder, run "scrapy crawl Skiddle_Club".
    allowed_domains = ["www.skiddle.com"]  # Allowed domains is a string, NOT a URL.
    start_urls = [
        "http://www.skiddle.com/whats-on/London/?sort=18&from_date=15+Apr+2015&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&eventcodes%5B%5D=4&view=gallery"
    ]

    rules = [
        Rule(SgmlLinkExtractor(restrict_xpaths='//a[@href="#"]'),  # Search the start URLs for links matching this XPath.
             callback="parse_item1",
             follow=True),
    ]

    def parse_start_url(self, response):  # http://stackoverflow.com/questions/15836062/scrapy-crawlspider-doesnt-crawl-the-first-landing-page
        return self.parse_item1(response)

    def parse_item1(self, response):
        items = []
        for info in response.xpath('//div[@class="moveable"]'):
            item = TutorialItem()  # Populate an item defined in items.py.
            item['table'] = "London"
            item['artist'] = info.xpath('.//div[@class="summary"]//text()').extract()[1]  # Extract artist information.
            items.append(item)
        return items
Answer 0 (score: 1)
I don't think you need a CrawlSpider here; a plain Spider (BaseSpider) will do the job. Check the code below.
from scrapy.spider import Spider  # Used to create a simple spider to extract data.
from datetime import datetime
from scrapy.http import Request
from skiddle.items import *
import re


class Skiddle_ClubSpider(Spider):
    name = "Skiddle_Club"
    allowed_domains = ["www.skiddle.com"]
    start_urls = [
        "http://www.skiddle.com/whats-on/London/?sort=18&from_date=15+Apr+2015&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&eventcodes%5B%5D=4&view=gallery"
    ]

    def __init__(self, name=None, **kwargs):
        super(Skiddle_ClubSpider, self).__init__(name, **kwargs)
        self.today = datetime.today().strftime('%d+%b+%Y')
        # Rebuild the start URL so from_date is always today's date.
        self.start_urls = [
            "http://www.skiddle.com/whats-on/London/?sort=18&from_date={}&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&eventcodes%5B%5D=4&view=gallery".format(self.today)
        ]

    def parse(self, response):
        # The listing is paginated through an AJAX endpoint; start at offset o=0.
        _next_page_url = 'http://www.skiddle.com/whats-on/London/?ajaxing=1&sort=18&from_date={}&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&view=gallery&o=0&bannertitle={}'.format(self.today, self.today)
        # dont_filter=True is needed because response.url has already been seen by the dupe filter.
        yield Request(url=response.url, meta={'next': _next_page_url}, dont_filter=True, callback=self.parse_artist)

    def parse_artist(self, response):
        artists = response.xpath('//div[@class="summary"]/a/text()').extract()
        if artists:
            for artist in artists:
                item = TutorialItem(  # Populate an item defined in items.py.
                    table="London",
                    artist=artist.replace('\\t', '').replace('\\r', '').replace('\\n', '')
                )
                yield item
        else:
            # No more results: stop paginating.
            return
        # Read the current offset from the URL stored in meta and request the next page (24 results per page).
        page_index = re.findall(r'&o=(\d+)&bannertitle', response.meta['next'])
        if page_index:
            next_page_url = 'http://www.skiddle.com/whats-on/London/?ajaxing=1&sort=18&from_date={}&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6&view=gallery&o={}&bannertitle={}'.format(self.today, str(int(page_index[0]) + 24), self.today)
            yield Request(url=next_page_url, callback=self.parse_artist, meta={'next': next_page_url})
Pagination is done with AJAX requests; you can see the AJAX URL in next_page_url, and the start URL is updated according to the crawl date.
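If you want to sanity-check that AJAX endpoint outside Scrapy first, a throwaway snippet like the one below should do it (this assumes the query string above is still accepted by the site; the date value is just an example in the same format the spider builds with strftime('%d+%b+%Y')):

import requests

today = "15+Apr+2015"  # example date in the spider's from_date format
for offset in (0, 24, 48):
    url = ("http://www.skiddle.com/whats-on/London/?ajaxing=1&sort=18"
           "&from_date={0}&to_date=&radius=10&rkeep=&eventcodes%5B%5D=6"
           "&view=gallery&o={1}&bannertitle={0}").format(today, offset)
    body = requests.get(url).text
    # An empty or shrinking body suggests there are no more pages at that offset.
    print("offset %d -> %d bytes" % (offset, len(body)))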