When I run my scraper, it scrapes roughly 200 records from a site that contains about 250. I can't figure out what mistake I made while building it. Any help would be greatly appreciated.

items.py:
import scrapy

class WiseowlItem(scrapy.Item):
    Name = scrapy.Field()
    Url = scrapy.Field()
The spider, named "wiseowlsp.py":

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class WiseowlspSpider(CrawlSpider):
    name = "wiseowlsp"
    allowed_domains = ['www.wiseowl.co.uk']
    start_urls = ['http://www.wiseowl.co.uk/videos/']

    rules = [Rule(LinkExtractor(restrict_xpaths='//li[@class="woMenuItem"]')),
             Rule(LinkExtractor(restrict_xpaths='//div[@class="woPaging tac"]'),
                  callback='parse_items')]

    def parse_items(self, response):
        page = response.xpath('//div[@class="woVideoListRow"]')
        for title in page:
            AA = title.xpath('.//p[@class="woVideoListDefaultSeriesTitle"]/a/text()').extract()
            BB = title.xpath('.//p[@class="woVideoListDefaultSeriesTitle"]/a/@href').extract()
            yield {'Name': AA, 'Url': BB}
If I use the style I'm pasting below I get the results I want, but I'd like to avoid relying on a regular expression.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from wiseowl.items import WiseowlItem

class WiseowlspSpider(CrawlSpider):
    name = "wiseowlsp"
    allowed_domains = ["wiseowl.co.uk"]
    start_urls = ['http://www.wiseowl.co.uk/videos/']

    rules = [Rule(LinkExtractor(allow=('uk/videos/.*')), callback='parse_items', follow=True)]

    def parse_items(self, response):
        page = response.xpath('//div[@class="woVideoListRow"]')
        for title in page:
            item = WiseowlItem()
            item["Name"] = title.xpath('.//p[@class="woVideoListDefaultSeriesTitle"]/a/text()').extract()
            item["Url"] = title.xpath('.//p[@class="woVideoListDefaultSeriesTitle"]/a/@href').extract()
            yield item
With restrict_xpaths, the spider always skips the first page and only starts scraping from the next page onwards. I believe there should be some way (while staying with this restrict_xpaths approach) to make the data on the first page get scraped as well. Looking forward to a push in the right direction.
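(For reference, a minimal sketch of one possible tweak, not part of the original question: in a CrawlSpider, a page fetched through a Rule that has no callback is only used for link extraction, and the response for start_urls goes to parse_start_url(), which does nothing by default. Giving the menu rule a callback and overriding parse_start_url should therefore cover the pages that are currently skipped.)

    # Hypothetical adjustment inside the WiseowlspSpider class above (not the poster's code):
    # give each rule a callback so landing pages are parsed too, and handle the start page.
    rules = [Rule(LinkExtractor(restrict_xpaths='//li[@class="woMenuItem"]'),
                  callback='parse_items', follow=True),
             Rule(LinkExtractor(restrict_xpaths='//div[@class="woPaging tac"]'),
                  callback='parse_items', follow=True)]

    def parse_start_url(self, response):
        # route the very first response through the same item-parsing method
        return self.parse_items(response)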
Answer 0 (score: 2)
I hate using the typical Rule and LinkExtractor setup; it's hard to follow because Scrapy does everything on its own. I always prefer to use the start_requests method, which is the entry point of a Spider.

For the site you are scraping, I would first work out the logic in my head and then translate it into code.

Here is code that works 100%.
from scrapy.contrib.spiders import CrawlSpider
from scrapy.http.request import Request
import logging

class WiseowlspSpider(CrawlSpider):
    name = "wiseowlsp"

    def start_requests(self):
        # go to the home page
        yield Request(url="http://www.wiseowl.co.uk/videos/", callback=self.parse_home_page)

    def parse_home_page(self, response):
        # parse all category links in the left-hand menu
        for cat in response.css(".woMenuList > li"):
            logging.info("\n\n\nScraping Category: %s" % (cat.css("a::text").extract_first()))
            yield Request(url="http://www.wiseowl.co.uk" + cat.css("a::attr(href)").extract_first(), callback=self.parse_listing_page)

    def parse_listing_page(self, response):
        items = response.xpath('//div[@class="woVideoListRow"]')
        for title in items:
            AA = title.xpath('.//p[@class="woVideoListDefaultSeriesTitle"]/a/text()').extract()
            BB = title.xpath('.//p[@class="woVideoListDefaultSeriesTitle"]/a/@href').extract()
            yield {'Name': AA, 'Url': BB}

        next_page = response.css("a.woPagingNext::attr(href)").extract_first()
        if next_page is not None:
            logging.info("\n\n\nGoing to next page %s" % (next_page))
            # if there is a "next" link, scrape that page too
            yield Request(url="http://www.wiseowl.co.uk" + next_page, callback=self.parse_listing_page)
        else:
            # otherwise follow any remaining numbered paging links
            for more_pages in response.css("a.woPagingItem"):
                next_page = more_pages.css("::attr(href)").extract_first()
                logging.info("\n\n\nGoing to next page %s" % (next_page))
                yield Request(url="http://www.wiseowl.co.uk" + next_page, callback=self.parse_listing_page)
And add this to settings.py:
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
Now you can see that my code reads easily from top to bottom, and you can follow its logic.
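(Added note, not from the original answer: assuming a standard Scrapy project layout, the spider can be run and its scraped items exported with the usual crawl command; the output file name below is just an example.)

    scrapy crawl wiseowlsp -o wiseowl_videos.json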