从url下载文件(在列表中)

时间:2016-12-22 23:09:40

标签: python pdf

我正在抓谷歌的pdf文件(白皮书),并希望将它们保存为文件而不是在控制台中列出。

以下是我目前的代码:

import  requests, re
from    docopt import docopt
from    bs4 import BeautifulSoup
from    time import time as timer
from urllib.request import urlopen, urlretrieve, quote
from urllib.parse import urljoin

def get_urls(search_string, start):
    """Scrape one page of Google search results and return the result URLs.

    Parameters:
        search_string: the query sent as Google's ``q`` parameter.
        start: result offset (Google paginates 10 results per page),
            passed through as the ``start`` parameter.

    Returns:
        A list of target URLs extracted from the result links; may be
        empty if the page layout yields no matches.
    """
    urls = []
    search_url = 'https://www.google.com/search'
    payload = {'q': search_string, 'start': start}
    # Browser-like User-Agent: Google serves a different (harder to
    # parse) page to unknown clients.
    my_headers = {'User-agent': 'Mozilla/11.0'}
    r = requests.get(search_url, params=payload, headers=my_headers)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Each organic result title is an <h3 class="r"> wrapping an <a>.
    # NOTE(review): this matches Google's 2016-era markup — verify it
    # still holds against a current results page.
    for h3 in soup.find_all('h3', class_='r'):
        anchor = h3.a
        if anchor is None:
            continue
        # Google wraps each target as /url?q=<target>&sa=...; extract
        # <target>. Raw string avoids the invalid '\&' escape warning.
        match = re.search(r'url\?q=(.+?)&sa', anchor.get('href', '') or '')
        if match:
            urls.append(match.group(1))
    return urls

def main():
    """Entry point: scrape <pages> pages of Google results for <search>.

    Parses CLI arguments with docopt, collects result URLs page by
    page, de-duplicates them, then prints the URLs, a count, and the
    elapsed time.
    """
    started = timer()
    args = docopt( __doc__, version='MakMan Google Scrapper & Mass Exploiter' )
    query = args['<search>']
    page_count = int(args['<pages>'])
    # Google paginates 10 results per page; 'start' is the offset of
    # the first result on each page.
    scraped = []
    for page_no in range(page_count):
        scraped.extend(get_urls(query, str(page_no * 10)))
    # Drop duplicate URLs (order is not significant here).
    unique = list(set(scraped))
    print(*unique, sep='\n')
    print( '\nTotal URLs Scraped : %s ' % str(len(unique)) )
    print( 'Script Execution Time : %s ' % (timer() - started,) )
if __name__ == '__main__':
    main()


#End

我尝试添加:

# NOTE(review): with print's default sep=' ', this writes every URL onto a
# single space-separated line; pass sep='\n' to get one URL per line.
with open ('file.txt', 'w') as f:
   print( *result, file=f)   

最后把结果写入文件,但我确信应该有更简单的方法直接下载这些pdf文件,而不必先把链接保存到文件里。

1 个答案:

答案 0 :(得分:2)

如果你有PDF文件的网址,你可以像这样使用urlretrieve()(Python 2 中位于 urllib 模块,Python 3 中位于 urllib.request)。这会将文件下载到当前工作目录,并保留URL中的文件名。当然,您也可以指定任何您喜欢的目标路径。

from os import path
# Python 3 locations (the question's own code already imports from
# these modules); on Python 2 these were urllib.urlretrieve and
# urlparse.urlparse.
from urllib.request import urlretrieve
from urllib.parse import urlparse

src_url = 'http://path/to/document.pdf'
# Use the last component of the URL's path as the local file name,
# so the download lands in the current working directory.
tgt_path = path.split(urlparse(src_url).path)[-1]
urlretrieve(src_url, tgt_path)