My crawler collects data from a set of URLs, but when I run it again to pick up new content, the old items get saved to my MongoDB database a second time. Is there a way to check whether an item is already in my MongoDB database (duplicate items have the same title field) and, if it is, drop it from the pipeline? Alternatively, would it be better to save everything and then delete the duplicates from the database afterwards, and if so, how would I implement that in my project?
Here is my pipeline:
import logging

import pymongo
from scrapy.exceptions import DropItem


class MongoPipeline(object):

    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        ## pull in information from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        ## initializing spider
        ## opening db connection
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        ## clean up when spider is closed
        self.client.close()

    def process_item(self, item, spider):
        ## how to handle each post
        bbcDict = {}
        if item['art_content'] != []:
            bbcDict['art_content'] = item['art_content']
            bbcDict['date'] = item['date']
            bbcDict['date_str'] = item['date_str']
            bbcDict['title'] = item['title']
            bbcDict['url'] = item['url']
            self.db[self.collection_name].insert_one(dict(bbcDict))
        return item

        # self.db[self.collection_name].insert(dict(item))
        # logging.debug("Post added to MongoDB")
        # return item
And here is my spider:
from datetime import datetime as dt

import scrapy

from ArtScraper.items import ArtscraperItem


class PostSpider(scrapy.Spider):
    article = ""
    name = 'crawly'
    allowed_domains = []
    start_urls = ['http://feeds.bbci.co.uk/arabic/rss.xml']

    def parse(self, response):
        # parse each item in the RSS feed and follow its link
        articles = response.xpath('//channel/item')
        for article in articles:
            item = ArtscraperItem()
            item['date'] = dt.today()
            item['date_str'] = article.xpath('pubDate/text()').extract_first()
            item['url'] = article.xpath('link/text()').extract_first()
            item['title'] = article.xpath('title/text()').extract_first()
            url = item['url']
            yield scrapy.Request(
                url,
                callback=self.parse_article,
                meta={'item': item},  # carry over our item
            )

    def parse_article(self, response):
        item = response.meta['item']
        pars = response.xpath("//div[@class='story-body']/div[@class='story-body__inner']/p/text()").extract()
        item['art_content'] = '-'.join(pars)
        yield item
Thanks.
Answer 0 (score: 0)
You can keep a list of titles on your MongoPipeline class to filter out duplicates while items are processed, raising DropItem inside process_item to discard any item whose title has already been seen. The official docs provide a good example of exactly this pattern. You can then save the item to MongoDB when it is returned.

In this case, a duplicates filter in your pipeline could look like the sketch below:
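This sketch follows the DuplicatesPipeline example from the official Scrapy docs, merged into your pipeline. The titles_seen set and the query that seeds it from the existing collection are additions of mine (so duplicates are caught across runs, not only within a single crawl) and are untested against your project:

import logging

import pymongo
from scrapy.exceptions import DropItem


class MongoPipeline(object):

    collection_name = 'articles'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        # titles seen so far; any item whose title is in here gets dropped
        self.titles_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        # seed the set with titles already stored, so re-runs skip old items
        for doc in self.db[self.collection_name].find({}, {'title': 1, '_id': 0}):
            self.titles_seen.add(doc.get('title'))

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        if item['title'] in self.titles_seen:
            raise DropItem("Duplicate item found: %s" % item['title'])
        self.titles_seen.add(item['title'])
        if item['art_content'] != []:
            self.db[self.collection_name].insert_one(dict(item))
            logging.debug("Post added to MongoDB")
        return item

As for deleting duplicates after saving them: it is usually simpler not to write them in the first place. If you would rather enforce this at the database level than in memory, you could put a unique index on title and upsert each item, roughly like this (again a sketch, not tested code):

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        # reject any second document with the same title at the DB level
        self.db[self.collection_name].create_index('title', unique=True)

    def process_item(self, item, spider):
        self.db[self.collection_name].update_one(
            {'title': item['title']},  # match an existing document by title
            {'$set': dict(item)},      # insert it, or refresh it in place
            upsert=True,
        )
        return item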