用beautifulsoup库替换所有xml文本

时间:2015-06-11 07:55:07

标签: python xml beautifulsoup placeholder

我需要使用Python中的BeautifulSoup库替换xml中的所有文本。例如,我有这样一段xml:

<Paragraph>
Procedure general informations
<IntLink Target="il_0_mob_411" Type="MediaObject"/>
<Strong>DIFFICULTY: </Strong>
<IntLink Target="il_0_mob_231" Type="MediaObject"/>
<IntLink Target="il_0_mob_231" Type="MediaObject"/> - 
<Strong>DURATION:</Strong> 15 min.<br/>
<Strong>TOOLS REQUIRED:</Strong> 4mm Allen Key, Pin driver
</Paragraph>

我需要它像这样:

<Paragraph>
0
<IntLink Target="il_0_mob_411" Type="MediaObject"/>
<Strong>1</Strong>
<IntLink Target="il_0_mob_231" Type="MediaObject"/>
<IntLink Target="il_0_mob_231" Type="MediaObject"/> - 
<Strong>2</Strong>3<br/>
<Strong>4</Strong>5
</Paragraph>

谢谢!

1 个答案:

答案 0 :(得分:0)

以下是代码:

# -*- coding: utf-8 -*-
import HTMLParser
import codecs
import os
import sys
from bs4 import BeautifulSoup


# Export every non-blank text node of export_2.xml to export.txt as
# "<id>;<text>" lines, and write the document with each of those text nodes
# replaced by its numeric id to export_ph.xml.

# NOTE(review): no parser is named here, so BeautifulSoup chooses one itself;
# an HTML parser may alter the XML markup on output (e.g. tag-name case) --
# confirm which parser this script is meant to run with.
with open("export_2.xml") as xml_doc:
    soup = BeautifulSoup(xml_doc)
pars = HTMLParser.HTMLParser()

counter = 0
# Collected up front, so replacing nodes inside the loop does not disturb
# the iteration.
all_texts = soup.find_all(text=True)

sys.stdout.write("Inizio esportazione:\n")
# Open the text export once in "w" mode: this truncates any previous export
# and avoids reopening the file for every text node.
with codecs.open("export.txt", "w", encoding="utf-8") as file_text:
    for text in all_texts:
        s = pars.unescape(text)
        sys.stdout.write(". ")  # one progress dot per text node visited

        # Test the raw text, before the "<id>;" prefix is added; once the
        # prefix is there the string can never be empty or all-whitespace.
        if s == "" or s.isspace():
            continue

        file_text.write(u"%d;%s\n" % (counter, s))

        ## put  placeholder in the xml
        # The id written to export.txt is the same one left in the document,
        # so the two files can be matched up again later.
        text.replace_with(u"%d" % counter)
        counter = counter + 1

# Serialise the modified tree once; decode() yields unicode text, which the
# codecs writer encodes as UTF-8.
with codecs.open("export_ph.xml", "w", encoding="utf-8") as file_xml:
    file_xml.write(soup.decode())

# Both files are closed at this point, so the sizes on disk are final.
file_xml_info = os.path.getsize('export_ph.xml')
file_txt_info = os.path.getsize('export.txt')
if file_txt_info > 0 and file_xml_info > 0:
    sys.stdout.write(
        "\nEsportazione completata: \nFile xml: " + str(file_xml_info) + "B"
        + "\nFile testo a 3 colonne: " + str(file_txt_info) + "B" + "\n"
    )