Question

我正在尝试使用Scrapy / Python编写一个爬虫程序，它从页面中读取一些值。

然后我想让这个抓取工具在单独的字段中存储最高和最低值。

到目前为止，我能够从页面中读取值（请参阅下面的代码），但我不确定如何计算最低值和最高值并存储在单独的字段中？

例如，假设抓取工具读取页面并返回这些值

burvale-score = 75.25
richmond-score = 85.04
somano-score =''（缺少值）
tucson-score = 90.67
cloud-score = 50.00

所以我想填充......

'highestscore': 90.67
'lowestscore': 50.00

我该怎么做？我需要使用数组吗？将所有值放在数组中，然后选择最高/最低？

另请注意，我的代码中有 2 个 yield：底部的 yield 负责提供要抓取的网址，第一个 yield 则执行实际的抓取，即从底部 yield 提供的每个网址中收集值。

非常感谢任何帮助。如果可以，请提供代码示例。

到目前为止，这是我的代码 ....如果缺少值，我会存储-1。

# Spider from the question, kept exactly as posted. It starts at the course
# listing page, emits one dict of <meta> values per page, and follows the
# links found in the "scrapy" list back into parse().
# NOTE(review): the indentation below is inconsistent as pasted (the first
# `yield` sits at the same level as its `for`), and the two `??????` entries
# are placeholders for the values being asked about, so this is not valid
# Python as written.
class MySpider(BaseSpider):
    name = "courses"
    start_urls = ['http://www.example.com/all-courses-listing']
    allowed_domains = ["example.com"]
    def parse(self, response):
     # `hxs` is only used by the link-following loop at the bottom.
     hxs = Selector(response)
    #for courses in response.xpath(response.body):
     # NOTE(review): every XPath in the dict starts with `//`, so it is
     # evaluated against the whole document rather than relative to
     # `courses` -- presumably each iteration yields the same values; confirm.
     for courses in response.xpath("//meta"):
     yield {
                # Page-level metadata, read from each tag's content attribute.
                'pagetype': courses.xpath('//meta[@name="pagetype"]/@content').extract_first(),
                'pagefeatured': courses.xpath('//meta[@name="pagefeatured"]/@content').extract_first(),
                'pagedate': courses.xpath('//meta[@name="pagedate"]/@content').extract_first(),
                'pagebanner': courses.xpath('//meta[@name="pagebanner"]/@content').extract_first(),
                'pagetitle': courses.xpath('//meta[@name="pagetitle"]/@content').extract_first(),
                'pageurl': courses.xpath('//meta[@name="pageurl"]/@content').extract_first(),
                'pagedescription': courses.xpath('//meta[@name="pagedescription"]/@content').extract_first(),
                'pageid': courses.xpath('//meta[@name="pageid"]/@content').extract_first(),

                # Per-campus scores: extract_first('') supplies '' when the tag
                # is absent, and `'' or -1` turns a blank value into -1.0.
                'courseatarburvale': float(courses.xpath('//meta[@name="courseatar-burvale"]/@content').extract_first('').strip() or -1),
                'courseatarrichmond': float(courses.xpath('//meta[@name="courseatar-richmond"]/@content').extract_first('').strip() or -1),
                'courseatarsomano': float(courses.xpath('//meta[@name="courseatar-somano"]/@content').extract_first('').strip() or -1),
                'courseatartucson': float(courses.xpath('//meta[@name="courseatar-tucson"]/@content').extract_first('').strip() or -1),
                'courseatarcloud': float(courses.xpath('//meta[@name="courseatar-cloud"]/@content').extract_first('').strip() or -1),
                # Placeholders: the highest and lowest of the five scores above
                # are what the question asks how to compute.
                'highestscore'; ??????
                'lowestscore'; ??????
               }
     # Queue every linked page and parse it with this same callback.
     for url in hxs.xpath('//ul[@class="scrapy"]/li/a/@href').extract():
      yield Request(response.urljoin(url), callback=self.parse)

Answer 1

我可能会分解这部分代码：

# The item literal from the question, quoted unchanged. The last two entries
# are placeholders (not valid Python) for the values still to be computed.
yield {
    'pagetype': courses.xpath('//meta[@name="pagetype"]/@content').extract_first(),
    'pagefeatured': courses.xpath('//meta[@name="pagefeatured"]/@content').extract_first(),
    'pagedate': courses.xpath('//meta[@name="pagedate"]/@content').extract_first(),
    'pagebanner': courses.xpath('//meta[@name="pagebanner"]/@content').extract_first(),
    'pagetitle': courses.xpath('//meta[@name="pagetitle"]/@content').extract_first(),
    'pageurl': courses.xpath('//meta[@name="pageurl"]/@content').extract_first(),
    'pagedescription': courses.xpath('//meta[@name="pagedescription"]/@content').extract_first(),
    'pageid': courses.xpath('//meta[@name="pageid"]/@content').extract_first(),

    'courseatarburvale': float(courses.xpath('//meta[@name="courseatar-burvale"]/@content').extract_first('').strip() or -1),
    'courseatarrichmond': float(courses.xpath('//meta[@name="courseatar-richmond"]/@content').extract_first('').strip() or -1),
    'courseatarsomano': float(courses.xpath('//meta[@name="courseatar-somano"]/@content').extract_first('').strip() or -1),
    'courseatartucson': float(courses.xpath('//meta[@name="courseatar-tucson"]/@content').extract_first('').strip() or -1),
    'courseatarcloud': float(courses.xpath('//meta[@name="courseatar-cloud"]/@content').extract_first('').strip() or -1),
    'highestscore'; ??????
    'lowestscore'; ??????
}

改成这样：

item = {
    'pagetype': courses.xpath('//meta[@name="pagetype"]/@content').extract_first(),
    'pagefeatured': courses.xpath('//meta[@name="pagefeatured"]/@content').extract_first(),
    'pagedate': courses.xpath('//meta[@name="pagedate"]/@content').extract_first(),
    'pagebanner': courses.xpath('//meta[@name="pagebanner"]/@content').extract_first(),
    'pagetitle': courses.xpath('//meta[@name="pagetitle"]/@content').extract_first(),
    'pageurl': courses.xpath('//meta[@name="pageurl"]/@content').extract_first(),
    'pagedescription': courses.xpath('//meta[@name="pagedescription"]/@content').extract_first(),
    'pageid': courses.xpath('//meta[@name="pageid"]/@content').extract_first(),
}

scores = {
    'courseatarburvale': float(courses.xpath('//meta[@name="courseatar-burvale"]/@content').extract_first('').strip() or -1),
    'courseatarrichmond': float(courses.xpath('//meta[@name="courseatar-richmond"]/@content').extract_first('').strip() or -1),
    'courseatarsomano': float(courses.xpath('//meta[@name="courseatar-somano"]/@content').extract_first('').strip() or -1),
    'courseatartucson': float(courses.xpath('//meta[@name="courseatar-tucson"]/@content').extract_first('').strip() or -1),
    'courseatarcloud': float(courses.xpath('//meta[@name="courseatar-cloud"]/@content').extract_first('').strip() or -1),
}

values = sorted(x for x in scores.values() if x > 0)
scores.update({
    'highestscore': values[-1],
    'lowestscore': values[0],
})

item.update(scores)
yield item

Scrapy / Python：在 yield 中处理值

1 个答案: