I have this scrapy spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from items import QuestionItem


class FirstSpider(scrapy.Spider):
    name = 'first'
    allowed_domains = ['stackoverflow.com']
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        selector_list = response.css('.question-summary')
        for selector in selector_list:
            item = QuestionItem()
            item['question'] = selector.css('h3 a::text').extract()
            item['votes'] = selector.css('.vote-count-post strong::text').extract()
            item['answers'] = selector.css('.status strong::text').extract()
            item['views'] = selector.css('.views ::text').extract()[0].replace('\n','').replace('\r','').lstrip()
            item['username'] = selector.css('.user-details a::text').extract()
            item['userlink'] = selector.css('.user-details a::attr(href)').extract()
        return item
I also have this code in items.py:

import scrapy

class QuestionItem(scrapy.Item):
    question = scrapy.Field()
    votes = scrapy.Field()
    answers = scrapy.Field()
    views = scrapy.Field(serializer=str)
    username = scrapy.Field()
    userlink = scrapy.Field()
It should start from Stack Overflow's default questions page and grab all the questions using CSS selectors. However, when I run it with this command, it only saves a single row to the CSV:

scrapy crawl first --output file.csv
Answer 0 (score: 1)
Your parse method contains a return statement, and as soon as it executes the function terminates. You should use yield instead of return, and it needs to sit inside the for loop so that one item is emitted per iteration.
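As a minimal standalone sketch of the difference (plain Python, separate from the spider): return hands back one value and ends the call, while yield turns the function into a generator that produces a value on every loop pass. Scrapy iterates over whatever parse produces, so a generator lets it collect every item.

def with_return(values):
    for v in values:
        return v  # ends the call on the first iteration; later values are lost

def with_yield(values):
    for v in values:
        yield v  # emits one value per iteration, then resumes the loop

print(with_return([1, 2, 3]))       # 1
print(list(with_yield([1, 2, 3])))  # [1, 2, 3]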
class FirstSpider(scrapy.Spider):
    name = 'first'
    allowed_domains = ['stackoverflow.com']
    start_urls = ['https://stackoverflow.com/questions']

    def parse(self, response):
        selector_list = response.css('.question-summary')
        for selector in selector_list:
            item = QuestionItem()
            item['question'] = selector.css('h3 a::text').extract()
            item['votes'] = selector.css('.vote-count-post strong::text').extract()
            item['answers'] = selector.css('.status strong::text').extract()
            item['views'] = selector.css('.views ::text').extract()[0].replace('\n','').replace('\r','').lstrip()
            item['username'] = selector.css('.user-details a::text').extract()
            item['userlink'] = selector.css('.user-details a::attr(href)').extract()
            yield item  # yield inside the loop: one item per question summary
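With yield inside the loop, parse becomes a generator and Scrapy collects one item per question summary, so the CSV ends up with a row for every question on the page instead of just one.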