Scrapy: does not collect data from all pages

Date: 2017-07-01 20:30:30

Tags: python scrapy

Please help me understand what the mistake is. The spider visits the pages /?start=0, /?start=25 and /?start=50, but data is only collected from the last page (50). My code:

from scrapy import FormRequest
from scrapy import Request
import scrapy
from scrapy.spiders import CrawlSpider

from ..items import GetDomainsItem


def pages_range(start, step):
    stop = 50
    r = start
    while r <= stop:
        yield r
        r += step

class GetUrlDelDomSpider(CrawlSpider):
    name = 'get_domains'

    allowed_domains = ["member.expireddomains.net"]

    paginate = pages_range(0, 25)

    start_urls = list(map(lambda i: 'https://member.expireddomains.net/domains/expiredcom201612/?start=%s' % i, paginate))
    def start_requests(self):
        for start_url in self.start_urls:
            yield Request(start_url, dont_filter=True)

    def parse(self, response):
        yield FormRequest.from_response(response,
                                        formnumber=1,
                                        formdata={'login': 'xxx', 'password': '*****', 'rememberme': '1'},
                                        callback=self.parse_login,
                                        dont_filter=True)
    def parse_login(self, response):
        if b'The supplied login information are unknown.' not in response.body:
            item = GetDomainsItem()
            for each in response.selector.css('table.base1 tbody '):
                item['domain'] = each.xpath('tr/td[@class="field_domain"]/a/text()').extract()
                return item

Thank you for your help.

1 answer:

Answer 0 (score: 2)

The return item statement in the parse_login method breaks out of the loop:

for each in response.selector.css('table.base1 tbody '):
    item['domain'] = each.xpath('tr/td[@class="field_domain"]/a/text()').extract()
    return item
    ^

So you should create an item on each iteration of the loop and yield it:

for each in response.selector.css('table.base1 tbody '):
    item = GetDomainsItem()
    item['domain'] = each.xpath('tr/td[@class="field_domain"]/a/text()').extract()
    yield item
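
For reference, this is what the complete parse_login method looks like with that fix applied (a minimal sketch: the login check, the selectors and the GetDomainsItem field are taken from the question unchanged):

def parse_login(self, response):
    # Proceed only if the login was accepted.
    if b'The supplied login information are unknown.' not in response.body:
        for each in response.selector.css('table.base1 tbody '):
            # Create a fresh item for every row, so a previous
            # result is not overwritten.
            item = GetDomainsItem()
            item['domain'] = each.xpath('tr/td[@class="field_domain"]/a/text()').extract()
            # yield hands the item to Scrapy and keeps the loop running,
            # so every paginated response contributes its rows.
            yield item

Because the method now yields instead of returning, items are collected from each of the three responses (?start=0, ?start=25 and ?start=50), not just the last one.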