我需要使用Python中的BeautifulSoup库替换xml中的所有文本。例如,我有如下一段xml:
<Paragraph>
Procedure general informations
<IntLink Target="il_0_mob_411" Type="MediaObject"/>
<Strong>DIFFICULTY: </Strong>
<IntLink Target="il_0_mob_231" Type="MediaObject"/>
<IntLink Target="il_0_mob_231" Type="MediaObject"/> -
<Strong>DURATION:</Strong> 15 min.<br/>
<Strong>TOOLS REQUIRED:</Strong> 4mm Allen Key, Pin driver
</Paragraph>
我需要它像这样:
<Paragraph>
0
<IntLink Target="il_0_mob_411" Type="MediaObject"/>
<Strong>1</Strong>
<IntLink Target="il_0_mob_231" Type="MediaObject"/>
<IntLink Target="il_0_mob_231" Type="MediaObject"/> -
<Strong>2</Strong>3<br/>
<Strong>4</Strong>5
</Paragraph>
谢谢!
答案 0 :(得分:0)
以下是代码:
# -*- coding: utf-8 -*-
import HTMLParser
import codecs
import os
import sys
from bs4 import BeautifulSoup
xml_doc = open("export_2.xml")
soup = BeautifulSoup(xml_doc)
pars = HTMLParser.HTMLParser()
open('export.txt', 'w').close()
file_xml = open('export_ph.xml', 'w')
counter = 0
all_texts = soup.find_all(text=True)
print "Inizio esportazione:"
for text in all_texts:
s = pars.unescape(text)
s = str(counter)+ ";"+ s + "\n"
if not (s == "" or s.isspace()):
with codecs.open("export.txt", "a", encoding="utf-8") as file_text:
file_text.write(s)
counter = counter+1
print ".",
## put placeholder in the xml
all_xml = soup.find_all()
for text in all_xml:
s = pars.unescape(text.get_text())
with codecs.open("export_ph.xml", "a", encoding="utf-8") as file_xml:
file_xml.write(s)
file_xml_info = os.path.getsize('export_ph.xml')
file_txt_info = os.path.getsize('export.txt')
if (file_txt_info > 0 and file_xml_info > 0):
print "\nEsportazione completata: \nFile xml: " + str(file_xml_info) + "B" + "\nFile testo a 3 colonne: " + str(file_txt_info) + "B"