我知道这个问题看起来已经有人问过并得到了解答,但我之所以再次提问,是因为我对那些回答仍有疑问。我用 Python 写了一个脚本,它读取一个包含表格的文本文件,并把结果输出到一个 HTML 文件中。现在我的老板希望我改造这个脚本,使它能够直接处理 HTML 文件。由于我的代码是针对文本文件编写的,我打算先去掉 HTML 文件中的标签,把剩下的文本保存到一个文本文件里,这样就可以继续用原来的脚本读取它。以下是我目前写出的代码:
#from numPy import loadtxt
import sys
from urllib.request import urlopen
from html.parser import HTMLParser
class HTMLTextExtractor(HTMLParser):
    """Collect the text content of an HTML document, discarding all tags.

    Feed markup with ``feed()``, call ``close()`` once the input is complete,
    then read the accumulated text with ``get_text()``.

    On Python 3.5+ the base parser decodes character references itself
    (``convert_charrefs`` defaults to True), so ``handle_charref`` and
    ``handle_entityref`` are only invoked when that option is disabled.
    They are kept, and decode the same way the base parser does, so the
    result is the same in either mode.
    """

    def __init__(self):
        # The base initialiser must run: it sets up parser state such as
        # ``convert_charrefs`` that ``feed()`` relies on.
        super().__init__()
        self.result = []

    def handle_data(self, d):
        """Record a run of text found between tags."""
        self.result.append(d)

    def handle_charref(self, number):
        """Record the character for a numeric reference.

        ``number`` is the reference without ``&#`` and ``;``, e.g. ``"65"``
        or ``"x41"``.
        """
        from html import unescape

        # unescape() handles decimal and hexadecimal forms and replaces
        # invalid code points instead of raising.
        self.result.append(unescape("&#" + number + ";"))

    def handle_entityref(self, name):
        """Record the character for a named reference such as ``amp``.

        Unknown names are kept literally as ``&name;`` instead of raising.
        """
        from html import unescape

        self.result.append(unescape("&" + name + ";"))

    def get_text(self):
        """Return all text collected so far as a single string."""
        return "".join(self.result)
def html_to_text(html):
    """Return the text content of an HTML string with every tag removed.

    ``html`` is the markup itself (a ``str``), not a file name or an open
    file object.
    """
    s = HTMLTextExtractor()
    s.feed(html)
    # feed() may hold back trailing text (for example one ending in an
    # unterminated "&...") in case more input follows; close() forces the
    # parser to process whatever is still buffered.
    s.close()
    return s.get_text()
# import BeautifulSoup
# Row counters: numCorrect / numWrong are bumped for each e-mail row written
# in the success / 540 sections; amount540 counts 540 rows. amount9000 and
# amount541 are not referenced in the visible part of the script.
numCorrect = numWrong = 0
amount9000 = amount540 = amount541 = 0

# Section flags: the main loop sets exactly one of these to True when it
# meets the matching marker line and clears the other three.
if9000 = if540 = if541 = ifSuccess = False

# sometimes the email would be repeated again and would mess my code up,
# so I used this to help
newNextLine = True

# Per-section progress markers: set to 1 on the marker line and to 1000
# once an e-mail row has been written (success and 540 sections).
tester9000 = tester540 = tester541 = testerSuccess = 0

# Scratch strings; tempStr / tempStr2 are used by the 540 section to compare
# the current line with an earlier one.
tempStr = tempStr2 = ""
temp9000Str = temp9000Str2 = ""
temp541Str = temp541Str2 = ""
# url = "file:///C:/Python34/CID-Sync-0619.html" # reading an html file
# Read the HTML report first, so that a missing input file does not leave an
# empty ExtractFile.txt behind.
with open("CID-Sync-0619.html", "r") as html_file:
    myString = html_file.read()
# print(sub('<[^<]+?>', '', myString))
# Save the tag-free text. The tag stripper defined in this file is
# html_to_text(); the `with` block closes the file so the text is flushed
# to disk before anything else reads it.
with open("ExtractFile.txt", "w") as Extractfile:
    Extractfile.write(html_to_text(myString))
# Open the report and emit the document prologue plus the table's header row.
# Resultfile stays open: the main loop appends the data rows to it.
Resultfile = open("EndResult.html", "w")
Resultfile.writelines((
    "<!DOCTYPE html>\n",
    "<html>\n",
    "<body>\n",
    '<table border="5" style="width:1200px">',
    "<tr>",  # first row
    # cells within the row
    " <td>e-mail</td>",
    " <td>status</td> ",
    " <td>CID</td>",
    " <td>VHM ID</td>",
    " <td>VHM Name</td>",
    " <td>Error Message</td>",
    " <td>SF Account ID</td>",
    " <td>API Call Time</td>",
    " <td>Verification Time</td>",
    "</tr>",  # end of first row
))
# Load every line of the input text file into memory for the main loop.
filename = "testing.txt"
# The `with` block closes the file as soon as the lines are read instead of
# leaving the handle open for the rest of the run.
with open(filename, "r") as file_object:
    myList = file_object.readlines()
print("List made")
# Main pass over the lines read from the input file (myList): detect which
# status section a line belongs to and append table cells to Resultfile.
# NOTE(review): the leading indentation of this loop was lost when the code
# was pasted, so the nesting of the statements below cannot be determined
# from the text alone - restore it from the original script before running.
for line in myList:
# Section markers: a line containing one of the four marker strings turns on
# the matching flag (ifSuccess / if540 / if541 / if9000), clears the other
# three, and sets the matching tester* variable to 1.
if("Verified Success" in line):
testerSuccess = 1
ifSuccess = True
if541 = False
if540 = False
if9000 = False
print ("success")
if("Error Code:540" in line):
tester540 = 1
if540 = True
if541 = False
ifSuccess = False
if9000 = False
print ("Error code 540")
if("Error Code:541" in line):
tester541 = 1
if541 = True
if9000 = False
if540 = False
ifSuccess = False
print("Error code 541")
# Unlike the 540/541 markers, this marker has a space after the colon.
if("Error Code: 9000" in line):
tester9000 = 1
if9000 = True
if540 = False
if541 = False
ifSuccess = False
print("Error code 9000")
# Success section: a line containing "@" opens a table row holding its first
# whitespace-separated field and the status 0.
if(ifSuccess):
if("@" in line):
numCorrect = numCorrect + 1
tempList = line.split()
Resultfile.write("<tr>") # row
Resultfile.write(" <td> %s </td>" %tempList[0])
Resultfile.write(" <td> 0 </td>")
testerSuccess = 1000
else:
# Presumably the detail line that follows an e-mail line; testerSuccess is
# still 1 on the marker line itself, which this check excludes.
if(testerSuccess != 1 ):
temp = line.split()
# print (temp)
# print("the length is %d" %len(temp))
if(len(temp)>3):
Resultfile.write(" <td> %s </td>"%temp[0])
Resultfile.write("<td> </td>")
Resultfile.write("<td> </td>")
Resultfile.write("<td> </td>")
Resultfile.write(" <td> %s </td>" %temp[1])
Resultfile.write(" <td> %s </td>"% temp[2])
Resultfile.write(" <td> %s</td>" % temp[3])
Resultfile.write("</tr>") # end of row
# 540 section: fields are split on tab characters here, not on arbitrary
# whitespace as in the success section.
# NOTE(review): no row handling for the 541 or 9000 sections is visible in
# this part of the script, although their flags are set above.
if(if540):
if("@" in line):
stopNextForLoop = False
numWrong = numWrong + 1
amount540 = amount540 + 1
tempList2 = line.split("\t")
# The "@" is not in the first tab-separated field: undo the two increments
# above and write the fields out as bare cells.
if("@" not in tempList2[0]):
numWrong = numWrong - 1
amount540 = amount540 - 1
skipRest = True # check if this is True!!!!!!
for items in tempList2:
Resultfile.write("<td> %s" %items)
stopNextForLoop = True
tempStr = tempList2[0]
# print ("1: " +tempStr)
# print("2: "+tempStr2)
if(len(tempList2)>1):
# print("in the first")
# Case-insensitive check of the second field against tempStr2.
if(tempList2[1].lower() in tempStr2.lower() and stopNextForLoop == False):
for items in tempList2:
Resultfile.write("<td> %s" %items)
skipRest = True
# if(tempStr.lower() in tempStr2.lower()):
# numWrong = numWrong - 1
# amount540 = amount540 - 1
# print("in here")
else:
tempStr2 = tempStr
# NOTE(review): skipRest is only ever assigned True in the visible code and
# is not initialised with the other state variables - confirm it is set to
# False somewhere, otherwise this comparison can raise NameError.
if(skipRest == False):
# print (tempList2)
Resultfile.write("<tr>") # row
Resultfile.write(" <td> %s </td>" %tempList2[0])
Resultfile.write(" <td> 540 </td>")
Resultfile.write(" <td> </td>")
if(len(tempList2)>4):
Resultfile.write(" <td> %s </td>"%tempList2[1])
Resultfile.write(" <td> %s </td>" %tempList2[2])
Resultfile.write(" <td> %s </td>"% tempList2[3])
Resultfile.write(" <td> </td>")
Resultfile.write(" <td> %s</td>" % tempList2[4])
Resultfile.write("<td> </td>")
# Resultfile.write(" <td> %s </td>" % temp[5])
Resultfile.write("</tr>") # end of row
tester540 = 1000
else:
if(tester540 != 1 ):
temp = line.split("\t")
# print (temp)
if(len(temp)>3):
Resultfile.write(" <td> %s </td>"%temp[0])
# NOTE(review): the visible code never writes the closing </table>, </body>
# or </html> tags and never closes Resultfile - confirm this happens in a
# part of the script that is not shown.
print("after")
我知道这段代码很长,所以只需要看开头部分即可。我唯一的问题是无法从 HTML 文档中去掉标签。我尝试过很多其他方法,但似乎都不起作用。最近我尝试使用 HTMLParser 并配合一个内部类,但运行时它给出了如下错误信息:
File "<frozen importlib._bootstrap>", line 1153, in exec
File "<frozen importlib._bootstrap>", line 1129, in _exec
File "<frozen importlib._bootstrap>", line 1471, in exec_module
File "<frozen importlib._bootstrap>", line 321, in _call_with_frames_removed
File "C:\Python34\GetErrors.py", line 70, in <module>
Extractfile.write(strip_tags(myString))
File "C:\Python34\GetErrors.py", line 21, in strip_tags
self.result.append(unichr(codepoint))
File "C:\Python34\lib\html\parser.py", line 165, in feed
self.goahead(0)
File "C:\Python34\lib\html\parser.py", line 198, in goahead
if self.convert_charrefs and not self.cdata_elem:
AttributeError: 'MLStripper' object has no attribute 'convert_charrefs'
答案 0 :(得分:0)
`html` 参数接收的不是 HTML 文件本身,而是该文件所包含的字符串(文本内容)。我先把 HTML 文件的内容读成字符串再传入,整个流程就完全正常了。