I have a web scraper that was written for me using Scrapy.
I want to add one extra field scraped from the site the spider is crawling.
The column header "Description" is created in the CSV output, but nothing is ever scraped into it.
# -*- coding: utf-8 -*-
import csv
import re

import scrapy
from pydispatch import dispatcher
from scrapy import signals
from scrapy.signalmanager import SignalManager


class Rapid7(scrapy.Spider):
    name = 'vulns'
    allowed_domains = ['rapid7.com']
    main_url = 'https://www.rapid7.com/db/?q=&type=nexpose&page={}'
    # start_urls = ['https://www.rapid7.com/db/vulnerabilities']
    keys = ['Published', 'CVEID', 'Added', 'Modified', 'Related', 'Severity',
            'CVSS', 'Created', 'Solution', 'References', 'Description', 'URL']

    def __init__(self):
        # Run _close once the spider finishes
        SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)

    def start_requests(self):
        # Walk the first nine result pages of the vulnerability database
        for i in range(1, 10):
            url = self.main_url.format(i)
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Skip result pages that report an error or contain no results
        flag = True
        temp = response.xpath('//div[@class="vulndb__intro-content"]/p/text()').extract_first()
        if temp:
            if temp.strip() == 'An error occurred.':
                flag = False
        temp = [i for i in response.xpath('//*[@class="results-info"]/parent::div/p/text()').extract() if i.strip()]
        if len(temp) == 1:
            flag = False
        if flag:
            # Queue every result link for detail parsing
            for article in response.xpath('//*[@class="vulndb__results"]/a/@href').extract():
                yield scrapy.Request(response.urljoin(article), callback=self.parse_article, dont_filter=True)

    def parse_article(self, response):
        item = dict()
        item['Published'] = item['Added'] = item['Modified'] = item['Related'] = item['Severity'] = item['Description'] = ''
        r = response.xpath('//h1[text()="Related Vulnerabilities"]/..//a/@href').extract()
        temp = response.xpath('//meta[@property="og:title"]/@content').extract_first()
        item['CVEID'] = ''
        # Pull the CVE id out of the page title; retry with a looser pattern on failure
        try:
            temp2 = re.search(r'(CVE-.*-\d*)', temp).groups()[0]
            if ":" in temp2:
                raise KeyError
        except Exception:
            try:
                temp2 = re.search(r'(CVE-.*):', temp).groups()[0]
            except Exception:
                temp2 = ''
        if temp2:
            item['CVEID'] = temp2.replace(': Important', '').replace(')', '')
        # Key/value rows from the summary table (Published, Added, Modified, ...)
        table = response.xpath('//section[@class="tableblock"]/div')
        for row in table:
            header = row.xpath('header/text()').extract_first()
            data = row.xpath('div/text()').extract_first()
            item[header] = data
        temp = [i for i in response.xpath('//div[@class="vulndb__related-content"]//text()').extract() if i.strip()]
        for ind, i in enumerate(temp):
            if "CVE" in i:
                temp[ind] = i.replace(' ', '')
        item['Related'] = ", ".join(temp) if temp else ""
        temp2 = [i for i in response.xpath('//h4[text()="Solution(s)"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['Solution'] = ", ".join(temp2) if temp2 else ''
        temp3 = [i for i in response.xpath('//h4[text()="References"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['References'] = ", ".join(temp3) if temp3 else ''
        # The new field: the column shows up in the CSV, but this always comes back empty
        temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['Description'] = ", ".join(temp4) if temp4 else ''
        item['URL'] = response.request.url
        # Emit a row with every expected column, blank where nothing was found
        new_item = dict()
        for key in self.keys:
            if key not in list(item.keys()):
                new_item[key] = ''
            else:
                new_item[key] = item[key]
        yield new_item

    def _close(self):
        print("Done Scraping")
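For reference, when a spider like this is run through Scrapy's built-in feed export (e.g. scrapy crawl vulns -o vulns.csv), the keys of the yielded dict become the CSV column headers. A minimal sketch of pinning the column order explicitly, assuming the standard FEED_EXPORT_FIELDS setting in settings.py:

# settings.py -- sketch, assuming Scrapy's built-in CSV feed exporter
FEED_FORMAT = 'csv'
FEED_URI = 'vulns.csv'
# Forces the column order to match the spider's keys list
FEED_EXPORT_FIELDS = [
    'Published', 'CVEID', 'Added', 'Modified', 'Related', 'Severity',
    'CVSS', 'Created', 'Solution', 'References', 'Description', 'URL',
]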
Thank you.
Answer 0 (score: 1)
Try replacing your temp4 line:

temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]

with:

temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/p/text()').extract() if i.strip()]

Under <h4>Description</h4> there are no <ul><li> tags, only <p> tags, so the original XPath never selects anything.
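If some vulnerability pages render the description as a list while others use paragraphs, a defensive variant could try the <p> selector first and fall back to <ul><li>. This is only a sketch, not part of the original answer; the variable name description is illustrative:

description = [i for i in response.xpath('//h4[text()="Description"]/parent::*/p/text()').extract() if i.strip()]
if not description:
    # Fallback in case some pages do use a list layout after all
    description = [i for i in response.xpath('//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]
item['Description'] = ", ".join(description) if description else ''

Both selectors are easy to check interactively by opening one of the article URLs in scrapy shell and running each .xpath(...).extract() by hand.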