这是我的代码:
import os
import time
import urllib.error
import urllib.request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

# Download every PDF linked from the index page at _URL into ./pdfs/.

# Browser-like User-Agent, sent with every request (index page and PDFs).
user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
headers = {'User-Agent': user_agent}
_URL = 'http://papers.xtremepapers.com/CIE/Cambridge%20International%20A%20and%20AS%20Level/Chemistry%20%289701%29/'
_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled connection

r = requests.get(_URL, headers=headers, timeout=_TIMEOUT)
# Fail loudly instead of scraping links out of an error page.
r.raise_for_status()
soup = bs(r.text, 'html.parser')

urls = []
names = []
for link in soup.find_all('a'):
    href = link.get('href')
    # Anchors without an href (or pointing at something else) are skipped.
    if not href or not href.endswith('.pdf'):
        continue
    # urljoin resolves relative, root-relative and absolute hrefs correctly;
    # plain string concatenation only works for bare relative file names.
    urls.append(urljoin(_URL, href))
    # Keep only the last path segment so the name is a valid file name.
    names.append(href.split('/')[-1])

os.makedirs("pdfs", exist_ok=True)

names_urls = zip(names, urls)
for name, url in names_urls:
    print(url)
    rq = urllib.request.Request(url, None, headers)
    try:
        with urllib.request.urlopen(rq, timeout=_TIMEOUT) as res:
            content_type = res.headers.get_content_type()
            # An HTML error page saved as ".pdf" is what produces files
            # that download fine but cannot be opened.
            if content_type != "application/pdf":
                print("skipped {}: unexpected content type '{}'".format(url, content_type))
                continue
            data = res.read()
    except urllib.error.URLError as e:
        print("{} failed : {}".format(url, e))
        continue
    with open(os.path.join("pdfs", name), 'wb') as pdf:
        pdf.write(data)
print("completed")
PDF 文件可以正常下载,但是当我打开它们时,却得到了一个错误(error)。
PS:我是 Python 新手,所以如果这是一个新手才会犯的错误,请原谅我。
答案 0 :(得分:0)
我无法确切指出你代码中的错误——最可能的原因是你构建 PDF 网址的方式(至少这是我的第一猜测)。不过,使用 Python 的 requests 库,并且只把根 URL("http://papers.xtremepapers.com")作为基础 URL 时,下载似乎一切正常(至少返回的内容类型是预期的 application/pdf)。下面的脚本应该可以工作(可能会有拼写之类的小错误——我没有完整测试过整个脚本,毕竟我并不需要那些 PDF xD):
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Download every PDF linked from PAGE_URL into ./pdfs/, skipping any link
# whose response is not an HTTP 200 with a PDF content type.

ROOT_URL = 'http://papers.xtremepapers.com'
PAGE_URL = ROOT_URL + '/CIE/Cambridge%20International%20A%20and%20AS%20Level/Chemistry%20%289701%29/'
TIMEOUT = 30  # seconds; avoids hanging forever on a stalled connection

page = requests.get(PAGE_URL, timeout=TIMEOUT)
# Fail loudly instead of scraping links out of an error page.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

os.makedirs("pdfs", exist_ok=True)

for link in soup.find_all('a'):
    href = link.get('href')
    if not (href and href.endswith('.pdf')):
        continue

    # builds a working absolute url whether href is relative,
    # root-relative or already absolute
    url = urljoin(PAGE_URL, href)
    # only keeps the filename part as, well, filename
    name = href.split('/')[-1]
    print("url: {} - name : {}".format(url, name))

    try:
        r = requests.get(url, timeout=TIMEOUT)
        # this will raise if not 200
        r.raise_for_status()
        # check the content type and raise if not ok; parameters such as
        # "; charset=..." and letter case are ignored for the comparison
        content_type = r.headers.get("Content-Type", "")
        if content_type.split(";")[0].strip().lower() != "application/pdf":
            raise ValueError("unexpected content type '{}'".format(content_type))
    except (requests.RequestException, ValueError) as e:
        print("{} failed : {}".format(url, e))
        continue

    with open(os.path.join("pdfs", name), 'wb') as pdf:
        pdf.write(r.content)
    print("{} ok".format(name))

print("Done")