此脚本使用 Beautiful Soup 解析网站特定页面上所有 PDF 文档的链接。脚本能成功下载一个文件,但不会下载解析返回的全部文件。我需要帮助,以便下载已经解析出的所有 PDF 文档。
我已经做过研究,但没有找到答案。
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml
# Alternative value for RFP_Import, currently commented out.
#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx? portalId=895956&pageId=1606144')
# Page that get_pdf_links() fetches and scans for PDF links.
RFP_Import = ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
# Scheme and host of the site, with no trailing slash.
place_hoder = ('http://www.staffordmsd.org')
def get_pdf_links():
    """Return absolute URLs for every PDF linked from the RFP_Import page.

    Returns:
        list of str: one absolute URL per anchor whose href ends in "pdf"
        (compared case-insensitively).

    Raises:
        requests.HTTPError: if the listing page answers with a 4xx/5xx status.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(RFP_Import, timeout = 30)
    # Do not parse an error page as if it were the listing.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html5lib')
    # href=True skips anchors that have no href attribute; indexing those
    # with link['href'] would raise KeyError.
    links = soup.find_all('a', href=True)
    # urljoin resolves relative hrefs against the page URL and leaves
    # already-absolute hrefs untouched, unlike plain string concatenation.
    pdf_links = [urljoin(r.url, link['href']) for link in links
                 if link['href'].lower().endswith('pdf')]
    return pdf_links
def download_pdf_links (pdf_links):
    """Download the URLs in pdf_links into the current working directory.

    Each file is named after the last "/"-separated segment of its URL.

    NOTE(review): this is the code as posted in the question. The listing
    lost its indentation; it is restored here to match the behaviour the
    question reports and the answers diagnose (only the first file is
    downloaded).
    """
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        # stream = True lets the body be consumed piecewise via iter_content.
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            # Write the response in chunks of up to 1 MiB.
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
        # BUG (subject of the question): the next two lines sit inside the
        # for loop, so the function returns after the first iteration.
        print ('all RFPs downloaded!')
        return
# Script entry point: collect the PDF URLs, then hand them to the downloader.
if __name__ == "__main__":
    download_pdf_links(get_pdf_links())
脚本成功下载第一个 PDF 文档,然后就停止了。
import requests
from bs4 import BeautifulSoup
import html5lib
import lxml
# Alternative value for RFP_Import, currently commented out.
#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx? portalId=895956&pageId=1606144')
# Page that get_pdf_links() fetches and scans for PDF links.
RFP_Import = ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
# Scheme and host of the site, with no trailing slash.
place_hoder = ('http://www.staffordmsd.org')
def get_pdf_links():
    """Return absolute URLs for every PDF linked from the RFP_Import page.

    Returns:
        list of str: one absolute URL per anchor whose href ends in "pdf"
        (compared case-insensitively).

    Raises:
        requests.HTTPError: if the listing page answers with a 4xx/5xx status.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    # A timeout keeps the script from hanging forever on a stalled server.
    r = requests.get(RFP_Import, timeout = 30)
    # Do not parse an error page as if it were the listing.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html5lib')
    # href=True skips anchors that have no href attribute; indexing those
    # with link['href'] would raise KeyError.
    links = soup.find_all('a', href=True)
    # urljoin resolves relative hrefs against the page URL and leaves
    # already-absolute hrefs untouched, unlike plain string concatenation.
    pdf_links = [urljoin(r.url, link['href']) for link in links
                 if link['href'].lower().endswith('pdf')]
    return pdf_links
def download_pdf_links (pdf_links):
    """Download the URLs in pdf_links into the current working directory.

    Each file is named after the last "/"-separated segment of its URL.

    NOTE(review): this is the code as posted in the question. The listing
    lost its indentation; it is restored here to match the behaviour the
    question reports and the answers diagnose (only the first file is
    downloaded).
    """
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        # stream = True lets the body be consumed piecewise via iter_content.
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            # Write the response in chunks of up to 1 MiB.
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
        # BUG (subject of the question): the next two lines sit inside the
        # for loop, so the function returns after the first iteration.
        print ('all RFPs downloaded!')
        return
# Script entry point: collect the PDF URLs, then hand them to the downloader.
if __name__ == "__main__":
    download_pdf_links(get_pdf_links())
答案 0 :(得分:3)
在 `download_pdf_links()` 内部,`return` 的缩进不正确。它应与 `for` 对齐。否则,它就属于 `for` 循环体的一部分,函数会在第一次迭代后终止。
`print ('all RFPs downloaded!')` 可能也是如此。我想您是希望在遍历完所有链接之后、也就是 for 循环结束时才打印这句话。
答案 1 :(得分:1)
在 `download_pdf_links` 中,您在循环内部使用了 `return`,它会在循环的第一次迭代后返回,从而停止下载其余文件。您需要把 `return` 放在与循环开头相同的缩进层级,使其在循环结束之后才执行,如下所示:
def download_pdf_links (pdf_links):
    """Download every URL in pdf_links into the current working directory.

    Each file is named after the last "/"-separated segment of its URL and
    is written in chunks of up to 1 MiB, so a large PDF is never held in
    memory all at once.

    Args:
        pdf_links: iterable of absolute URLs to download.

    Raises:
        requests.HTTPError: if a server answers with a 4xx/5xx status.
    """
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        # The context manager releases the streamed connection even when
        # writing fails; the timeout stops a stalled server from hanging
        # the whole run.
        with requests.get(link, stream = True, timeout = 30) as r:
            # Fail loudly instead of saving an HTML error page as a .pdf.
            r.raise_for_status()
            with open(file_name, 'wb') as f:
                for chunk in r.iter_content(chunk_size = 1024*1024):
                    if chunk:
                        f.write(chunk)
        print ('%s downloaded!\n'%file_name)
    # Un-indented so it happens after the loop finishes.
    print ('all RFPs downloaded!')
    return