在python中解析并保存html的源代码

时间:2018-07-26 23:41:42

标签: python html parsing

我正在尝试提取一些html链接的内容。当我在单个链接上运行此脚本时,它可以工作,当我尝试从文本文件在多个链接上运行时,收到错误消息。

这是脚本

import bs4
from bs4 import BeautifulSoup
import urllib
from urllib.request import urlopen
import re
import requests
#import xml.etree.ElementTree as ET
from requests.auth import HTTPProxyAuth
import os
import os.path
from general import * 



def parser(link):
    #create the object, assign it to a variable
    proxy = urllib.request.ProxyHandler({
     (I removed it) 
    })
    # construct a new opener using your proxy settings
    opener = urllib.request.build_opener(proxy)
    # install the openen on the module-level
    urllib.request.install_opener(opener)
    #urllib.request.urlretrieve(link, "test.txt")

    # make a request
    #html=open("test.txt","r")
    html=urllib.request.urlretrieve(link)
    soup = BeautifulSoup(html,"html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text


for link in file_to_set('links.txt'):
    text = parser(link)
    path_parts = link.rpartition('/')[2]
    output_file = os.path.join('SOURCE TEXT', path_parts+'.txt')
    with open(output_file, mode='wt',encoding='utf-8') as outfile:
        outfile.write(text)
        print(outfile.name + ' is done')

我收到的错误消息是

ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: ''

或:

declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
TypeError: expected string or buffer

0 个答案:

没有答案