无法使用此python脚本下载多个文件

时间:2019-05-21 15:43:10

标签: python pdf web-scraping

此脚本使用漂亮的汤来解析网站特定页面上的所有pdf文档。该脚本成功下载了一个文件,但不会下载所有返回的文件。我需要帮助来下载我已经解析的所有pdf文档。

我已经做过研究,但没有找到答案

import requests
from bs4 import BeautifulSoup 
import html5lib
import lxml

#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx?    portalId=895956&pageId=1606144')
RFP_Import =     ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')

def get_pdf_links():
    r = requests.get(RFP_Import)
    soup= BeautifulSoup(r.content, 'html5lib')
    links = soup.find_all('a')
    pdf_links = [place_hoder + link['href'] for link in links if     link['href'].endswith('pdf')]
    return pdf_links



def download_pdf_links (pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
        print ('all RFPs downloaded!')
        return 

if __name__ == "__main__":
        pdf_links = get_pdf_links()
        download_pdf_links(pdf_links)

成功下载第一个pdf文档,然后停止。

import requests
from bs4 import BeautifulSoup 
import html5lib
import lxml

#RFP_Import = ('http://www.staffordmsd.org/cms/One.aspx?       portalId=895956&pageId=1606144')
RFP_Import =     ('http://www.staffordmsd.org/departments/business_operations/bids_and_proposals')
place_hoder = ('http://www.staffordmsd.org')

def get_pdf_links():
    r = requests.get(RFP_Import)
    soup= BeautifulSoup(r.content, 'html5lib')
    links = soup.find_all('a')
    pdf_links = [place_hoder + link['href'] for link in links if     link['href'].endswith('pdf')]
return pdf_links



def download_pdf_links (pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
        print ('all RFPs downloaded!')
        return 

if __name__ == "__main__":
        pdf_links = get_pdf_links()
        download_pdf_links(pdf_links)

2 个答案:

答案 0 :(得分:3)

download_pdf_links()内部,return未对齐。它应与for对齐。否则,它是for周期的一部分,并且函数在第一次迭代后终止。

print ('all RFPs downloaded!')可能也是如此。我想您希望在通过所有链接后在for循环结束时打印出来。

答案 1 :(得分:1)

download_pdf_link中,您正在循环内使用return,该循环将在循环的第一次迭代后返回并停止下载文件。您需要在循环结束后通过将其放在与循环开始相同的缩进处来返回,如下所示:

def download_pdf_links (pdf_links):
    for link in pdf_links:
        file_name = link.split("/")[-1]
        print ("Downloading file:%s"%file_name)
        r = requests.get(link, stream = True)
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size = 1024*1024):
                if chunk:
                    f.write(chunk)
        print ('%s downloaded!\n'%file_name)
    # Un-indented so it happens after the loop finishes.
    print ('all RFPs downloaded!')
    return