I am trying the code below to download all the PDF files from a link, but every time I run it, it downloads every file again. What I want: download all PDFs on the first run, and on subsequent runs download only the new ones (it should first check which ones are new). My code:
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'

def getLinks(url):
    all_links = []
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    for href in soup.find_all(class_='omrlist'):
        all_links.append(root_url + href.find('a').get('href'))
    return all_links

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')

# get report URL
reportLinks = []
for url in yearLinks:
    links = getLinks(url)
    #reportLinks.extend(links)
    #print(reportLinks)
    i = 0
    for url_ in links:
        if "AnnualStatisticalSupplement" not in url_:
            url__ = url_.replace("org..", "org").replace("../", "")
            response = requests.get(url__, stream=True)
            lastindex = url__.rfind('/')
            strlen = len(url__)
            filename = url__[lastindex:strlen]
            with open('/home/pdfs/' + str(filename), 'wb') as pdffile:
                pdffile.write(response.content)
            i += 1
            print(url__)

print("Download Completed")
After that I need to store each file in MongoDB. How can I do that with three columns (pdf name, report date, process flag)?
Answer 0 (score: 0)
Sorry for the major changes to your code, but it was too messy to read as it was.
If you only want to download the PDFs you don't already have, you need to add an if check
to control the flow. By the way, if you store the page URL in your database, you won't need to visit the page again just to get the pdf name.
import requests
from bs4 import BeautifulSoup

root_url = 'https://www.iea.org'
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
downloaded = ["2018-02-13.pdf"]  # the latest one I already have

def getLinks(url):
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    li = soup.find_all("li", class_="omrlist")
    links = [root_url + href.a.get('href') for href in li]
    return links

def get_pdf(url, flag=1):
    # find the month pages listed in the year directory
    pdf_page = requests.get(url, headers=headers)
    soup = BeautifulSoup(pdf_page.text, 'lxml')
    li = soup.find_all("li", class_="omrlist")[::-1]  # latest -> old
    latest_pdf_set = [root_url + href.a.get('href') for href in li]

    # find the pdf link on each month page
    pdf_links = []
    for pdf_url in latest_pdf_set:
        text = requests.get(pdf_url, headers=headers).text
        soup = BeautifulSoup(text, "lxml")
        link = soup.find("div", class_="omrreport pL10").find("a").get("href")
        if link.split("/")[-1] in downloaded:
            flag = 0  # flag = 0 means we reached a pdf we already have
            break
        pdf_links.append(root_url + link)
    return pdf_links, flag

yearLinks = getLinks(root_url + '/oilmarketreport/reports/')

all_ = []
for each in yearLinks:
    pdf_links = get_pdf(each)
    all_ += pdf_links[0]
    if not pdf_links[1]:
        # flag = 0, so stop: everything older is already downloaded
        break

print(all_)
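To actually fetch the new links collected in all_ and cover the MongoDB part of your question (pdf name, report date, process flag), a minimal sketch with pymongo could look like the code below. It assumes a local mongod, a database/collection named iea/omr_reports, and that the report date can be taken from the file name (e.g. 2018-02-13.pdf); those names are illustrative, so adjust them to your setup:

import os
import requests
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")   # assumed local MongoDB instance
col = client["iea"]["omr_reports"]                   # assumed database/collection names

for pdf_url in all_:
    filename = pdf_url.split("/")[-1]                # e.g. "2018-02-13.pdf"
    # skip anything already recorded in MongoDB
    if col.find_one({"pdf_name": filename}):
        continue
    resp = requests.get(pdf_url, headers=headers, stream=True)
    with open(os.path.join("/home/pdfs", filename), "wb") as f:
        f.write(resp.content)
    col.insert_one({
        "pdf_name": filename,                        # pdf name
        "report_date": filename.rsplit(".", 1)[0],   # report date, derived from the file name
        "process_flag": 0,                           # process flag, 0 = not processed yet
    })

Once the file names are stored in the collection, you can also drop the hard-coded downloaded list and build it from the database instead, e.g. downloaded = [d["pdf_name"] for d in col.find({}, {"pdf_name": 1})].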