I'm building a spider for a school project, scraping recipes from allrecipes.com. Everything works fine, except that I can't seem to remove duplicate recipes where one URL contains the actual recipe and the other contains the same URL with "video=true" appended.
Here is my attempt at handling this in pipelines.py:
from scrapy.exceptions import DropItem
from scrapy import log

class DuplicatesPipeline(object):
    # minCal = 50

    def __init__(self):
        self.urls_seen = set()

    def process_vids(self, item, spider):
        video = "video=true"
        url = str(item.get('url'))
        if video in url:
            raise DropItem("Contains video")
        else:
            return item

    def process_item(self, item, spider):
        unique_id = item.get('url')
        if unique_id in self.urls_seen:
            raise DropItem("Duplicate Item found (%s)" % unique_id)
        else:
            self.urls_seen.add('url')
            return item
settings.py:
# Scrapy settings for dirbot project
BOT_NAME = 'dirbot'
SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'
ITEM_PIPELINES = {'dirbot.pipelines.DuplicatesPipeline': 300,}
items.py:
from scrapy.item import Item, Field

class Website(Item):
    name = Field()
    url = Field()
    description = Field()
    kcal = Field()
    carbs = Field()
    fat = Field()
    protein = Field()
    main = Field()
    sugar = Field()
    fibre = Field()
    author = Field()
    rating = Field()
    img = Field()
dnot.py:
from scrapy.spider import Spider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.http.request import Request
from dirbot.items import Website
from scrapy.contrib.spiders import CrawlSpider, Rule
import urlparse
import scrapy

page = "http://allrecipes.com/recipes/main.aspx?Page=%d#recipes"

class DmozSpider(Spider):
    name = "dnot"
    allowed_domains = ["allrecipes.com"]
    start_urls = [page % 1]

    rules = [Rule(SgmlLinkExtractor(allow=('allrecipes.com'), restrict_xpaths='//a[contains(.,"NEXT")]'),
                  callback="parse", follow=True),
    ]

    def __init__(self):
        self.page_number = 1

    def parse(self, response):
        print "-------------------------------------------------"
        print self.page_number
        print "-------------------------------------------------"
        sel = Selector(response)
        sites = response.xpath('//div[@id="divGridItemWrapper"]')
        items = []
        for site in sites:
            item = Website()
            recipe = response.xpath('//a[contains(@href, "/Recipe/")]/@href').extract()
            url = "http://www.allrecipes.com"
            for nth in recipe:
                go = urlparse.urljoin(url, str(nth))
                items.append(item)
                for link in go:
                    yield Request(go, self.recipes)
        if self.page_number <= 3:
            self.page_number += 1
            yield Request(page % self.page_number)
        else:
            pass

    def recipes(self, response):
        item = Website()
        sel = Selector(response)
        recipe = response.xpath('//div[@id="content-wrapper"]')
        items = []
        print "second page - %s" % response.url
        for i in recipe:
            item['url'] = response.url
            item['description'] = i.xpath('//span[@itemprop="description"]/text()').extract()
            item['name'] = i.xpath('//h1[@itemprop="name"]/text()').extract()
            item['kcal'] = i.xpath('//ul/li[contains(.,"kcal")]/span/text()').extract()
            item['carbs'] = i.xpath('//ul/li[contains(.,"Carbohydrates")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['fat'] = i.xpath('//ul/li[contains(.,"Fat")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['protein'] = i.xpath('//ul/li[contains(.,"Protein")]/following-sibling::li[1]//span[@id="lblNutrientValue"]/text()').extract()
            item['main'] = "allrecipes.com"
            item['sugar'] = i.xpath('//li/span[@itemprop="sugarContent"]/text()').extract()
            item['fibre'] = i.xpath('//li/span[@itemprop="proteinContent"]/text()').extract()
            item['author'] = i.xpath('//span[@id="lblUser0"]/text()').extract()
            item['rating'] = i.xpath('//div[@class="rating-stars-img"][1]/meta[1][@itemprop="ratingValue"]/@content').extract()
            item['img'] = i.xpath('//img[@id="imgPhoto"]/@src').extract()
            items.append(item)
            yield item
I'm a bit new to Python, and I'm not sure whether I need to convert item['url'] to a string or not; I've tried "str" with no luck. I've also tried a few other approaches people use for similar things, but nothing has worked for me so far.
Hoping someone can point me in the right direction. Thanks in advance!
Example:
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1
item['url'] = http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true
Answer (score: 1):
You need to create a class in your pipelines.py file that implements the process_item method, for example:
from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs

from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        # Normalise the URL: strip the "video" query parameter and rebuild
        # the URL, so both variants of a recipe produce the same key.
        url = item['url']
        u = urlparse(url)
        query = parse_qs(u.query)
        query.pop('video', None)
        u = u._replace(query=urlencode(query, True))
        unique_id = urlunparse(u)

        if unique_id and unique_id in self.ids_seen:
            raise DropItem("Duplicate Item found (%s)" % unique_id)
        else:
            self.ids_seen.add(unique_id)
            return item
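For illustration, here is how that normalisation treats the two example URLs from the question. This is a standalone Python 2 sketch (matching the urllib/urlparse imports above); the normalize helper and the a/b variable names are only for this demo and are not part of the pipeline:

from urllib import urlencode
from urlparse import urlparse, urlunparse, parse_qs

def normalize(url):
    # Rebuild the URL with the "video" query parameter removed,
    # the same steps process_item performs above.
    u = urlparse(url)
    query = parse_qs(u.query)
    query.pop('video', None)
    return urlunparse(u._replace(query=urlencode(query, True)))

a = "http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1"
b = "http://allrecipes.com/Recipe/Delicious-Ham-and-Potato-Soup/Detail.aspx?evt19=1&referringHubId=1&video=true"

# The video=true parameter is stripped from the second URL, so both
# variants normalise to the same key and the second item is dropped
# as a duplicate.
print normalize(a)
print normalize(b)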
Then you need to add that class to your settings.py:
ITEM_PIPELINES = {
    'yourproject.pipelines.DuplicatesPipeline': 300,
}
Also, your process_vids method is never used; Scrapy does not call it (only process_item), so that check has no effect. If you want to drop the video items as well, see the sketch below.
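If you would rather drop the video variants outright instead of treating them as duplicates, you could fold that check into process_item. This is only a rough sketch reusing the urls_seen set and the exact-URL dedup from your original pipeline (the class name is just illustrative), not a replacement for the normalising pipeline above:

from scrapy.exceptions import DropItem

class DropVideoDuplicatesPipeline(object):

    def __init__(self):
        self.urls_seen = set()

    def process_item(self, item, spider):
        url = item['url']
        # Drop the "video=true" variant entirely rather than deduplicating it
        if 'video=true' in url:
            raise DropItem("Contains video (%s)" % url)
        # Plain exact-URL dedup for everything else
        if url in self.urls_seen:
            raise DropItem("Duplicate Item found (%s)" % url)
        self.urls_seen.add(url)
        return item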
Let me know if that helps.