My Hacker News spider is outputting all of its results on a single line instead of one result per line, as can be seen here.
Here is my code.
import scrapy
import string
import urlparse
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors import LinkExtractor


class HnItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    score = scrapy.Field()


class HnSpider(scrapy.Spider):
    name = 'hackernews'
    allowed_domains = ["news.ycombinator.com"]
    start_urls = ["https://news.ycombinator.com/"]

    def parse(self, response):
        sel = response
        selector_list = response.xpath('.//table[@class="itemlist"]')

        for sel in selector_list:
            item = HnItem()
            item['title'] = sel.xpath('.//td[@class="title"]/text()').extract()
            item['link'] = sel.xpath('.//tr[@class="athing"]/td[3]/a/@href').extract()
            item['score'] = sel.xpath('.//td[@class="subtext"]/span/text()').extract()
            yield item
And my settings.py file:
BOT_NAME = 'hnews'
SPIDER_MODULES = ['hnews.spiders']
NEWSPIDER_MODULE = 'hnews.spiders'
USER_AGENT = 'hnews (+http://www.yourdomain.com)'
FEED_URI = '/used/scrapy/hnews/%(name)s/%(time)s.csv'
FEED_FORMAT = 'csv'
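
As an aside (this command-line form is not part of my settings, just standard Scrapy usage), the same CSV feed can also be requested when running the spider instead of hard-coding FEED_URI and FEED_FORMAT:

scrapy crawl hackernews -o items.csv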
I've tried implementing this along with many other solutions, but no luck so far. I'm still very new to this, so please bear with me if possible.
Answer 0 (score: 0)
This is happening because your item pipeline receives all of the lists at once. For example, item['title'] gets the list of every title in one go; that whole list is then passed on to the item pipeline and written straight into the csv file as a single row.
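
To make that concrete, the single item yielded by the original parse() looks roughly like this (the values below are placeholders, not actual scraped data):

{'title': ['First story title', 'Second story title', '...'],
 'link': ['http://example.com/1', 'http://example.com/2', '...'],
 'score': ['100 points', '42 points', '...']}

Since the CSV exporter writes one row per item, all of those lists end up crammed into a single row.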
The solution is to iterate over the lists and yield the values to the item pipeline one at a time. Here is the modified code:
import scrapy
from scrapy.selector import Selector


class HnItem(scrapy.Item):
    title = scrapy.Field()
    link = scrapy.Field()
    score = scrapy.Field()


class HnSpider(scrapy.Spider):
    name = 'hackernews'
    allowed_domains = ["news.ycombinator.com"]
    start_urls = ["https://news.ycombinator.com/"]

    def parse(self, response):
        sel = Selector(response)
        item = HnItem()
        # Extract the three parallel lists once; [:-2] drops trailing
        # matches that are not story titles.
        title_list = sel.xpath('.//td[@class="title"]/a/text()').extract()[:-2]
        link_list = sel.xpath('.//tr[@class="athing"]/td[3]/a/@href').extract()
        score_list = sel.xpath('.//td[@class="subtext"]/span/text()').extract()

        # Yield one item per story so the CSV exporter writes one row each.
        for x in range(0, len(title_list)):
            item['title'] = title_list[x]
            item['link'] = link_list[x]
            item['score'] = score_list[x]
            yield item
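
A slightly more idiomatic variant of the same loop (a sketch, not from the original answer) pairs the three lists with zip() and creates a fresh HnItem for every story, which avoids reusing a single item instance and guards against the lists having different lengths:

    def parse(self, response):
        titles = response.xpath('.//td[@class="title"]/a/text()').extract()[:-2]
        links = response.xpath('.//tr[@class="athing"]/td[3]/a/@href').extract()
        scores = response.xpath('.//td[@class="subtext"]/span/text()').extract()
        # zip() stops at the shortest list, so a missing link or score
        # cannot raise an IndexError.
        for title, link, score in zip(titles, links, scores):
            item = HnItem()
            item['title'] = title
            item['link'] = link
            item['score'] = score
            yield item

With either version, the feed export should now contain one CSV row per story.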