我想抓一些网站。 问题是我无法抓取下一页
代码:
import scrapy
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import Spider
from scrapy import Selector
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
import urlparse
from scrapy.http.request import Request
class MySpider(Spider):
name = "craig"
allowed_domains = ["mmoga.de"]
start_urls = ["https://www.mmoga.de/BattleNet-Games/",
"https://www.mmoga.de/EA-Games/",
"https://www.mmoga.de/Gamecards-CD-Keys/",
"https://www.mmoga.de/Steam-Games/"]
rules = (Rule (SgmlLinkExtractor(allow=("Seite-\d00", ),restrict_xpaths=('//a',))
, callback="parse", follow= True),
)
def parse(self, response):
hxs = Selector(response)
ne = HtmlXPathSelector(response)
next_page = ne.select("//td[@class='pagerNextlink']/a/@href").extract()
titles = hxs.xpath('//table[@class="list"]/tr')
section = hxs.select('//div[@id="productListFilterContainer"]')
items = []
for title in titles:
item = CraigslistSampleItem()
item ["section"] = section.select('div/h2/text()').extract()
item ["preis"] = title.xpath('td/div[@style="margin-bottom:5px;"]/text()').extract()
if not item ["preis"]:
item ["preis"] = title.xpath('td/div[@style="margin-bottom:5px;color:#F00;"]/text()').extract()
item ["title"] = title.xpath("td/a/text()").extract()
item ["link"] = title.xpath('td/a/@href').extract()
if not item ["preis"]:
pass
else:
items.append(item)
return items
接下来的页面采用以下格式:.../Seite-1
https://www.mmoga.de/Steam-Games/Seite-1
该程序在终端中运行时没有错误:
scrapy crawl craig -o item.csv