剥离html标签

时间:2014-06-24 19:01:10

标签: html python-3.x

我知道这看起来像是一个已经有人问过并回答过的问题,但我之所以再问,是因为我对那些问题的答案仍有疑问。我用python写了一个脚本,它读取一个包含表格的文本文件,并把结果输出到一个html文件中。现在我的老板希望我改造这个脚本,让它也能直接处理html文件。由于我的代码是针对文本文件编写的,我想先把html文件中的标签剥离,再把剩下的文本保存到文本文件中,这样原来的脚本就可以直接读取它。这是我到目前为止写出的代码:

#from numPy import loadtxt

import sys
from html.entities import name2codepoint
from html.parser import HTMLParser
from urllib.request import urlopen

class HTMLTextExtractor(HTMLParser):
    """Collect the text content of an HTML document, discarding all tags.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_text()``.  Numeric and named character references are decoded by
    the ``handle_charref`` / ``handle_entityref`` hooks.
    """

    def __init__(self):
        # convert_charrefs=False makes the parser route references to the
        # handle_charref/handle_entityref hooks below; with the Python 3.5+
        # default (True) those hooks are never invoked.
        super().__init__(convert_charrefs=False)
        self.result = []

    def handle_data(self, d):
        """Store a run of text found between tags."""
        self.result.append(d)

    def handle_charref(self, number):
        """Decode a numeric reference such as ``&#65;`` or ``&#x41;``.

        ``number`` is the reference without ``&#`` and ``;``; a leading
        ``x``/``X`` marks a hexadecimal value.
        """
        if number[0] in ('x', 'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        try:
            self.result.append(chr(codepoint))
        except (ValueError, OverflowError):
            # Value outside the Unicode range: emit U+FFFD instead of
            # aborting the whole extraction.
            self.result.append('\ufffd')

    def handle_entityref(self, name):
        """Decode a named reference such as ``&amp;``.

        Names missing from ``html.entities.name2codepoint`` are written
        back as ``&name;`` instead of raising ``KeyError``.
        """
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            self.result.append('&%s;' % name)
        else:
            self.result.append(chr(codepoint))

    def get_text(self):
        """Return everything collected so far as a single string."""
        return ''.join(self.result)

def html_to_text(html):
    """Return the text of *html* with every tag removed.

    Args:
        html: The markup itself as a ``str`` (the contents of the HTML
            document), because it is passed straight to ``feed()``.

    Returns:
        The text collected by ``HTMLTextExtractor``.
    """
    s = HTMLTextExtractor()
    s.feed(html)
    # close() forces the parser to process any input it is still buffering,
    # so trailing text is not silently dropped.
    s.close()
    return s.get_text()

# import BeautifulSoup
# Counters updated by the main loop.
numCorrect = numWrong = 0
amount9000 = amount540 = amount541 = 0

# Section flags: the main loop switches these when it meets a status line.
if9000 = if540 = if541 = ifSuccess = False

# Sometimes the e-mail is repeated in the input, which confused the
# processing; this flag was introduced to help with that.
newNextLine = True

# Per-section progress markers (0 until the section is first seen).
tester9000 = tester540 = tester541 = testerSuccess = 0

# Scratch strings remembered between consecutive lines.
tempStr = tempStr2 = ""
temp9000Str = temp9000Str2 = ""
temp541Str = temp541Str2 = ""

# url = "file:///C:/Python34/CID-Sync-0619.html"    # reading an html file



# Strip the markup from the HTML report and save the remaining text.
# The input is read first so that a missing input file does not leave an
# empty ExtractFile.txt behind, and both files are closed by the with-blocks
# so the written text is flushed to disk.
with open("CID-Sync-0619.html", "r") as html_file:
    myString = html_file.read()
with open("ExtractFile.txt", "w") as Extractfile:
    # html_to_text() is the tag-stripping helper defined in this file; the
    # previously called strip_tags() is not defined anywhere in it.
    Extractfile.write(html_to_text(myString))

# Open the HTML report and emit everything that precedes the data rows:
# the document preamble, the opening <table> tag and the header row.
# The handle stays open because the main loop keeps writing rows to it.
Resultfile = open("EndResult.html", "w")
for _preamble_line in ("<!DOCTYPE html>\n", "<html>\n", "<body>\n"):
    Resultfile.write(_preamble_line)

Resultfile.write('<table border="5" style="width:1200px">')

# Header row: one cell per column of the report, written without newlines.
_header_row = (
    "<tr>",
    "  <td>e-mail</td>",
    "  <td>status</td> ",
    "  <td>CID</td>",
    "  <td>VHM ID</td>",
    "  <td>VHM Name</td>",
    "  <td>Error Message</td>",
    "  <td>SF Account ID</td>",
    "  <td>API Call Time</td>",
    "  <td>Verification Time</td>",
    "</tr>",
)
Resultfile.writelines(_header_row)





filename = "testing.txt"

# Read every line of the input up front; the with-block closes the file
# as soon as the list has been built.
with open(filename, "r") as file_object:
    myList = file_object.readlines()
print("List made")

# Walk the input line by line.  A status line ("Verified Success" or one of
# the "Error Code" markers) selects the active section by setting exactly one
# of the section flags; the success and 540 sections then write table cells
# for the lines that follow.  The whole body belongs inside the loop because
# every branch inspects ``line``.
for line in myList:
    # --- section markers: switch the active section ---
    if "Verified Success" in line:
        testerSuccess = 1
        ifSuccess = True
        if541 = False
        if540 = False
        if9000 = False
        print("success")
    if "Error Code:540" in line:
        tester540 = 1
        if540 = True
        if541 = False
        ifSuccess = False
        if9000 = False
        print("Error code 540")
    if "Error Code:541" in line:
        tester541 = 1
        if541 = True
        if9000 = False
        if540 = False
        ifSuccess = False
        print("Error code 541")
    if "Error Code: 9000" in line:
        tester9000 = 1
        if9000 = True
        if540 = False
        if541 = False
        ifSuccess = False
        print("Error code 9000")

    # --- success section ---
    if ifSuccess:
        if "@" in line:
            # A line containing an e-mail address starts a new table row.
            numCorrect = numCorrect + 1
            tempList = line.split()
            Resultfile.write("<tr>")  # row
            Resultfile.write("  <td> %s </td>" % tempList[0])
            Resultfile.write("  <td> 0 </td>")
            testerSuccess = 1000
        else:
            # testerSuccess is 1 only until the first e-mail line of the
            # section has been seen, so earlier lines are skipped here.
            if testerSuccess != 1:
                temp = line.split()
                if len(temp) > 3:
                    Resultfile.write("  <td> %s </td>" % temp[0])
                    Resultfile.write("<td> </td>")
                    Resultfile.write("<td> </td>")
                    Resultfile.write("<td> </td>")
                    Resultfile.write("  <td> %s </td>" % temp[1])
                    Resultfile.write("  <td> %s </td>" % temp[2])
                    Resultfile.write("  <td> %s</td>" % temp[3])
                    Resultfile.write("</tr>")  # end of row

    # --- error 540 section ---
    if if540:
        if "@" in line:
            stopNextForLoop = False
            # NOTE(review): skipRest was read below without ever being
            # initialised (NameError on the first e-mail line).  Resetting it
            # for each e-mail line is assumed to be the intent -- confirm.
            skipRest = False
            numWrong = numWrong + 1
            amount540 = amount540 + 1
            tempList2 = line.split("\t")
            if "@" not in tempList2[0]:
                # The "@" is not in the first tab-separated field: undo the
                # counts and write the fields as bare cells instead of a row.
                numWrong = numWrong - 1
                amount540 = amount540 - 1
                skipRest = True
                for items in tempList2:
                    Resultfile.write("<td> %s" % items)
                    stopNextForLoop = True
            tempStr = tempList2[0]
            if len(tempList2) > 1:
                if tempList2[1].lower() in tempStr2.lower() and not stopNextForLoop:
                    for items in tempList2:
                        Resultfile.write("<td> %s" % items)
                        skipRest = True
            else:
                # Remember this first field for the comparison above on a
                # later line.
                tempStr2 = tempStr

            if not skipRest:
                Resultfile.write("<tr>")  # row
                Resultfile.write("  <td> %s </td>" % tempList2[0])
                Resultfile.write("  <td> 540 </td>")
                Resultfile.write("  <td> </td>")
                if len(tempList2) > 4:
                    Resultfile.write("  <td> %s </td>" % tempList2[1])
                    Resultfile.write("  <td> %s </td>" % tempList2[2])
                    Resultfile.write("  <td> %s </td>" % tempList2[3])
                    Resultfile.write("  <td> </td>")
                    Resultfile.write("  <td> %s</td>" % tempList2[4])
                    Resultfile.write("<td> </td>")
                    Resultfile.write("</tr>")  # end of row

                tester540 = 1000
        else:
            if tester540 != 1:
                temp = line.split("\t")
                if len(temp) > 3:
                    Resultfile.write("  <td> %s </td>" % temp[0])
# Kept at top level, as in the original listing: runs once after the loop.
print("after")

我知道这段代码很长,所以只需要看开头部分。我唯一的问题是:我没有办法从html文档中剥离标签。我尝试了很多其他方法,但似乎都不起作用。最近我尝试将HTMLParser与内部类一起使用,但是运行时它会给出如下错误信息:

File "<frozen importlib._bootstrap>", line 1153, in exec
  File "<frozen importlib._bootstrap>", line 1129, in _exec
  File "<frozen importlib._bootstrap>", line 1471, in exec_module
  File "<frozen importlib._bootstrap>", line 321, in _call_with_frames_removed
  File "C:\Python34\GetErrors.py", line 70, in <module>
    Extractfile.write(strip_tags(myString))
  File "C:\Python34\GetErrors.py", line 21, in strip_tags
    self.result.append(unichr(codepoint))
  File "C:\Python34\lib\html\parser.py", line 165, in feed
    self.goahead(0)
  File "C:\Python34\lib\html\parser.py", line 198, in goahead
    if self.convert_charrefs and not self.cdata_elem:
AttributeError: 'MLStripper' object has no attribute 'convert_charrefs'

1 个答案:

答案 0 :(得分:0)

html参数不是html文件本身,而是html文件所包含的字符串(即文本内容)。我把html文件的内容作为字符串传入之后,整个过程就完全正常了。