我想检查csv文件中项目的标题,然后添加到csv文件(如果它不存在)。我几乎搜索了与重复值相关的任何回复。大多数情况下,它们约为DuplicatesPipeline
而其他的则不适合我。
这是我的自定义管道,即pipelines.py
class CheckCsvPipeline(object):
def __init__(self):
csv_path = r"C:\Users\HP\PycharmProjects\ToScrape\book\items.csv"
self.csvfile = open(csv_path, 'r')
self.readCsv = csv.reader(self.csvfile, delimiter=',')
def process_item(self, item, spider):
for row in self.readCsv:
if item['title'] in row:
raise DropItem("This title exists: %s" %item)
else:
return item
这是我的蜘蛛:
import scrapy
class BooksSpider(scrapy.Spider):
name = 'books'
allowed_domains = ['books.toscrape.com']
start_urls = ['http://books.toscrape.com/']
def parse(self, response):
books = response.xpath('//h3/a/@href').extract()
for book in books:
absolute_url = response.urljoin(book)
yield scrapy.Request(absolute_url, callback=self.parse_book)
# process next page
next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
absolute_next_page_url = response.urljoin(next_page_url)
yield scrapy.Request(absolute_next_page_url)
def parse_book(self, response):
title = response.css('h1::text').extract_first()
price = response.xpath('//*[@class="price_color"]/text()').extract_first()
yield {'title':title, 'price': price}
我使用以下代码运行蜘蛛,但它仍然添加了现有值。
scrapy crawl books -o items.csv
答案 0 :(得分:0)
我建议您在蜘蛛中维护一个标题列表,然后在管道内部检查标题中是否已存在标题,然后不要from discord.ext import commands
class TestCog:
def __init__(self, bot):
self.bot = bot
self.counter = 0
@commands.command()
async def add(self):
self.counter += 1
await self.bot.say('Counter is now %d' % self.counter)
def setup(bot):
bot.add_cog(TestCog(bot))
它。
yield
在你的蜘蛛中,做到这一点
class CheckCsvPipeline(object):
def __init__(self):
pass
def process_item(self, item, spider):
if item['title'] in spider.allTitles:
raise DropItem("This title exists: %s" % item)
else:
return item