I have a web scraper that was written for me using Scrapy.
I want to add one extra field scraped from the site the spider is crawling.
The column header "Description" is created in the CSV output, but nothing is ever scraped into it.
# -*- coding: utf-8 -*-
import csv
import re

import scrapy
from pydispatch import dispatcher
from scrapy import signals
from scrapy.signalmanager import SignalManager


class Rapid7(scrapy.Spider):
    name = 'vulns'
    allowed_domains = ['rapid7.com']
    main_url = 'https://www.rapid7.com/db/?q=&type=nexpose&page={}'
    # start_urls = ['https://www.rapid7.com/db/vulnerabilities']
    keys = ['Published', 'CVEID', 'Added', 'Modified', 'Related', 'Severity',
            'CVSS', 'Created', 'Solution', 'References', 'Description', 'URL']

    def __init__(self):
        # Run _close once the spider finishes
        SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)

    def start_requests(self):
        # Walk the first nine result pages of the vulnerability database
        for i in range(1, 10):
            url = self.main_url.format(i)
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Skip result pages that report an error or contain no results
        flag = True
        temp = response.xpath('//div[@class="vulndb__intro-content"]/p/text()').extract_first()
        if temp:
            if temp.strip() == 'An error occurred.':
                flag = False
        temp = [i for i in response.xpath('//*[@class="results-info"]/parent::div/p/text()').extract() if i.strip()]
        if len(temp) == 1:
            flag = False
        if flag:
            # Queue every result link for detail parsing
            for article in response.xpath('//*[@class="vulndb__results"]/a/@href').extract():
                yield scrapy.Request(response.urljoin(article), callback=self.parse_article, dont_filter=True)

    def parse_article(self, response):
        item = dict()
        item['Published'] = item['Added'] = item['Modified'] = item['Related'] = item['Severity'] = item['Description'] = ''
        r = response.xpath('//h1[text()="Related Vulnerabilities"]/..//a/@href').extract()
        temp = response.xpath('//meta[@property="og:title"]/@content').extract_first()
        item['CVEID'] = ''
        # Pull the CVE id out of the page title; retry with a looser pattern on failure
        try:
            temp2 = re.search(r'(CVE-.*-\d*)', temp).groups()[0]
            if ":" in temp2:
                raise KeyError
        except Exception:
            try:
                temp2 = re.search(r'(CVE-.*):', temp).groups()[0]
            except Exception:
                temp2 = ''
        if temp2:
            item['CVEID'] = temp2.replace(': Important', '').replace(')', '')
        # Key/value rows from the summary table (Published, Added, Modified, ...)
        table = response.xpath('//section[@class="tableblock"]/div')
        for row in table:
            header = row.xpath('header/text()').extract_first()
            data = row.xpath('div/text()').extract_first()
            item[header] = data
        temp = [i for i in response.xpath('//div[@class="vulndb__related-content"]//text()').extract() if i.strip()]
        for ind, i in enumerate(temp):
            if "CVE" in i:
                temp[ind] = i.replace(' ', '')
        item['Related'] = ", ".join(temp) if temp else ""
        temp2 = [i for i in response.xpath('//h4[text()="Solution(s)"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['Solution'] = ", ".join(temp2) if temp2 else ''
        temp3 = [i for i in response.xpath('//h4[text()="References"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['References'] = ", ".join(temp3) if temp3 else ''
        # The new field: the column shows up in the CSV, but this always comes back empty
        temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]
        item['Description'] = ", ".join(temp4) if temp4 else ''
        item['URL'] = response.request.url
        # Emit a row with every expected column, blank where nothing was found
        new_item = dict()
        for key in self.keys:
            if key not in list(item.keys()):
                new_item[key] = ''
            else:
                new_item[key] = item[key]
        yield new_item

    def _close(self):
        print("Done Scraping")
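For reference, when a spider like this is run through Scrapy's built-in feed export (e.g. scrapy crawl vulns -o vulns.csv), the keys of the yielded dict become the CSV column headers. A minimal sketch of pinning the column order explicitly, assuming the standard FEED_EXPORT_FIELDS setting in settings.py:

# settings.py -- sketch, assuming Scrapy's built-in CSV feed exporter
FEED_FORMAT = 'csv'
FEED_URI = 'vulns.csv'
# Forces the column order to match the spider's keys list
FEED_EXPORT_FIELDS = [
    'Published', 'CVEID', 'Added', 'Modified', 'Related', 'Severity',
    'CVSS', 'Created', 'Solution', 'References', 'Description', 'URL',
]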
Thank you.
Answer 0 (score: 1)
Try replacing your temp4 line:

temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]

with:

temp4 = [i for i in response.xpath('//h4[text()="Description"]/parent::*/p/text()').extract() if i.strip()]

Under <h4>Description</h4> there are no <ul><li> tags, only <p> tags, so the original XPath never selects anything.
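If some vulnerability pages render the description as a list while others use paragraphs, a defensive variant could try the <p> selector first and fall back to <ul><li>. This is only a sketch, not part of the original answer; the variable name description is illustrative:

description = [i for i in response.xpath('//h4[text()="Description"]/parent::*/p/text()').extract() if i.strip()]
if not description:
    # Fallback in case some pages do use a list layout after all
    description = [i for i in response.xpath('//h4[text()="Description"]/parent::*/ul/li/text()').extract() if i.strip()]
item['Description'] = ", ".join(description) if description else ''

Both selectors are easy to check interactively by opening one of the article URLs in scrapy shell and running each .xpath(...).extract() by hand.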