我正在尝试抓取该网站,以根据年份保存所有页面上的所有信息。我没有任何错误,但也没有得到详细信息。它必须具有1、2中给定的行。 HTM
有人可以指出我要去哪里了。这是代码:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import csv
file = "Details2.csv"
Headers = ("ID", "ID", "# of Exploits", "Type(s)", "Publish Date", "Update Date", "Score", "Gained Access Level", "Access", "Complexity", "Authentication", "Confiden", "Integr", "Avail")
f = open(file, "w")
csvriter = csv.writer(f, delimiter=',', quotechar='"')
csvriter.writerow(Headers)
for page in range(1,130):
try:
url = "https://www.justfly.com/vulner-list.php?vendor_id=0& product_id=0&version_id=0&page={}&hasexp=0&opdos=0&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year=2015&month=0&cweid=0&order=1&trc=6484&sha=f941b721732be362e81064704448767014116e7c".format(page)
#html = urlopen(url)
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
webpage = urlopen(req).read()
bs=BeautifulSoup(webpage, "lxml")
table_body=bs.find('tbody')
rows = table_body.find_all("div", {"class":"srrowns"})
for row in rows:
cols=row.find_all('td')
cols=[x.text.strip() for x in cols]
print("cols", file = f)
f.write("{}".format(cols).replace(",",",",",",",",",","|")+ "\n")
except: AttributeError
f.close()
答案 0 :(得分:1)
尝试以下脚本。它应该获取您上面提到的必需数据。原来您选择了错误的标签名称div
。应该改为tr
。
import csv
import requests
from bs4 import BeautifulSoup
url = "https://www.cvedetails.com/vulnerability-list.php?vendor_id=0& product_id=0&version_id=0&page={}&hasexp=0&opdos=0&opec=0&opov=0&opcsrf=0&opgpriv=0&opsqli=0&opxss=0&opdirt=0&opmemc=0&ophttprs=0&opbyp=0&opfileinc=0&opginf=0&cvssscoremin=0&cvssscoremax=0&year=2015&month=0&cweid=0&order=1&trc=6484&sha=f941b721732be362e81064704448767014116e7c"
Headers = ("CVE ID", "CWE ID", "# of Exploits", "Vulnerability Type(s)", "Publish Date", "Update Date", "Score", "Gained Access Level", "Access", "Complexity", "Authentication", "Confidentiality", "Integrity", "Availability")
with open("Details2.csv", "w", newline="") as f:
csvriter = csv.writer(f)
csvriter.writerow(Headers)
for page in range(1,5): #change here the highest number you wanna go across
res = requests.get(url.format(page), headers={'User-Agent':'Mozilla/5.0'})
bs = BeautifulSoup(res.text, "lxml")
for row in bs.find(id='vulnslisttable').find_all("tr", class_="srrowns"):
cols = [x.get_text(strip=True) for x in row.find_all('td')]
print(cols)
csvriter.writerow(cols)