无法使用urllib和Beautiful Soup下载PDF,出现403 Forbidden错误?

时间:2017-03-03 10:32:54

标签: python beautifulsoup urllib bs4

我试图使用python脚本从网站下载pdf。我找到了这段代码 http://pastebin.com/nRVXgmqF ,但它是用python 2编写的,我把它改成了python 3。我不知道为什么它不起作用,如果能得到帮助我会很感激,谢谢!

import urllib.parse
import urllib.request
import urllib.error
import os
import sys

try:
        from bs4 import BeautifulSoup
except ImportError:
        print("[*] Please download and install Beautiful Soup first!")
        sys.exit(0)

url = input("[+] Enter the url: ")
download_path = input("[+] Enter the download path in full: ")

# Browser-like User-Agent: many servers answer "403 Forbidden" to the
# default urllib User-Agent string.
headers = {'User-Agent': 'Mozilla/5.0'}

i = 0  # count of PDFs downloaded

request = urllib.request.Request(url, None, headers)
html = urllib.request.urlopen(request)
soup = BeautifulSoup(html.read(), "html.parser")  # parse the page

# <a> tags carrying an href attribute hold the candidate links.
for tag in soup.findAll('a', href=True):
        # Resolve relative links against the page URL.
        link = urllib.parse.urljoin(url, tag['href'])

        # splitext on the basename yields (name, extension); keep PDFs only.
        if os.path.splitext(os.path.basename(link))[1] == '.pdf':
                name = os.path.basename(link)
                print("\n[*] Downloading: %s" % name)

                # BUG FIX: the download request must carry the same headers,
                # otherwise the server answers 403 Forbidden again — this is
                # the exact line the traceback points at.
                req = urllib.request.Request(link, None, headers)
                current = urllib.request.urlopen(req)

                # BUG FIX: the original misplaced a parenthesis —
                # open(download_path + "\\" + os.path.basename(tag['href'], "wb"))
                # passed "wb" to basename (TypeError) and left open() without
                # a mode. os.path.join is also portable, unlike "\\".
                with open(os.path.join(download_path, name), "wb") as f:
                        f.write(current.read())
                i += 1

# BUG FIX: i already holds the final count; printing i+1 overcounted by one.
print("\n[*] Downloaded %d files" % i)
input("[+] Press any key to exit...")

我收到此错误

Traceback (most recent call last):
  File "C:\Users\random\Desktop\Pastpapers\downpp.py", line 33, in <module>
    current = urllib.request.urlopen(tag['href'])
  File "C:\Python35\lib\urllib\request.py", line 163, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Python35\lib\urllib\request.py", line 472, in open
    response = meth(req, response)
  File "C:\Python35\lib\urllib\request.py", line 582, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Python35\lib\urllib\request.py", line 510, in error
    return self._call_chain(*args)
  File "C:\Python35\lib\urllib\request.py", line 444, in _call_chain
    result = func(*args)
  File "C:\Python35\lib\urllib\request.py", line 590, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

发生了什么事?

我无法使上述代码生效!我意识到错误来自这一行current = urllib.request.urlopen(tag['href'])所以我决定使用请求库来下载它。

我的新工作代码!

import urllib.request
import urllib.parse
import os

from bs4 import BeautifulSoup
from requests import get  # to make GET requests

url           = input("[+] Enter the url: ")
download_path = input("[+] Enter the download path in full: ")
# Browser-like User-Agent; the default urllib one triggers 403 Forbidden.
headers       = {'User-Agent': 'Mozilla/5.0'}
request       = urllib.request.Request(url, None, headers)
html          = urllib.request.urlopen(request)
soup          = BeautifulSoup(html.read(), "html.parser")  # parse the page

for tag in soup.findAll('a', href=True):
    # Resolve relative links; crude percent-escape for spaces in URLs.
    link = urllib.parse.urljoin(url, tag['href']).replace(" ", "%20")
    if os.path.splitext(os.path.basename(link))[1] == '.pdf':
        # BUG FIX: fetch BEFORE opening the output file, so a failed
        # request does not leave an empty file behind; send the headers
        # here too instead of relying on requests' default User-Agent.
        response = get(link, headers=headers)
        response.raise_for_status()  # fail loudly on 4xx/5xx
        # os.path.join is portable, unlike the hard-coded "\\" separator.
        target = os.path.join(download_path, os.path.basename(link))
        # "with" + binary mode guarantees the file is flushed and closed.
        with open(target, "wb") as f:
            f.write(response.content)

0 个答案:

没有答案