I am trying to extract the text content of some HTML links. When I run this script on a single link it works, but when I try to run it on multiple links read from a text file, I get an error message.
Here is the script:
import bs4
from bs4 import BeautifulSoup
import urllib
from urllib.request import urlopen
import re
import requests
#import xml.etree.ElementTree as ET
from requests.auth import HTTPProxyAuth
import os
import os.path
from general import *
def parser(link):
    #create the object, assign it to a variable
    proxy = urllib.request.ProxyHandler({
        (I removed it)
    })
    # construct a new opener using your proxy settings
    opener = urllib.request.build_opener(proxy)
    # install the opener on the module level
    urllib.request.install_opener(opener)
    #urllib.request.urlretrieve(link, "test.txt")
    # make a request
    #html=open("test.txt","r")
    html = urllib.request.urlretrieve(link)
    soup = BeautifulSoup(html, "html.parser")
    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text

for link in file_to_set('links.txt'):
    text = parser(link)
    path_parts = link.rpartition('/')[2]
    output_file = os.path.join('SOURCE TEXT', path_parts + '.txt')
    with open(output_file, mode='wt', encoding='utf-8') as outfile:
        outfile.write(text)
        print(outfile.name + ' is done')
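For reference, file_to_set comes from my general helper module; it just reads the link file into a set of lines, roughly like this (simplified):

def file_to_set(file_name):
    # read each line of the file into a set; a blank line in the file ends up as ''
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results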
The error message I get is
ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''
or:
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
TypeError: expected string or buffer
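Trying to narrow it down, I can reproduce similar messages outside the script with a rough check (assuming Python 3 and a placeholder URL, no proxy):

import urllib.request

# a blank line in links.txt becomes an empty string, and urlopen/urlretrieve
# refuses it, which looks like the first error:
try:
    urllib.request.urlretrieve('')
except ValueError as err:
    print(err)    # unknown url type: ''

# urlretrieve() returns a (local_filename, headers) tuple rather than the HTML
# string, which is presumably what BeautifulSoup complains about in the second
# traceback:
result = urllib.request.urlretrieve('http://example.com')    # placeholder URL
print(type(result))    # <class 'tuple'>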