根据csv列替换xml值

时间:2021-07-14 08:40:35

标签: python python-3.x xml python-2.7 csv

我正在尝试根据 csv 列替换 xml 文件中的类名。实际上 xml 文件是注释文件。

这是xml的格式:

<annotation>
<folder>./test_xmls</folder>
<filename>000048_Panorama.jpg</filename>
<path>./images000048_Panorama.jpg</path>
<source>
<database>Unknown</database>
</source>
<size>
<width>4000</width>
<height>2000</height>
<depth>3</depth>
</size>
<segmented>0</segmented>
<object>
<name>AAAA</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox>

我的 csv 包含 `original` 列和 `change to` 列。

格式为:

| original | change to |
|----------|-----------|
| AAAA     | class_A   |

..................

CSV 有 20000 多行,涵盖了 80000 个 xml 文件中出现的所有类名(即所有 <name>…</name> 的取值,例如 <name>AAAA</name>)。

我想将 xml 中的名称(如 AAAA)与 csv 的 original 列进行匹配。如果它存在于 original 列中,就用 change to 列中的对应值替换它,例如 AAAA → class_A。

我尝试编写 python 代码,但它不起作用。 我的代码在这里

import xml.etree.ElementTree as ET
import os
import pandas as pd
from collections import defaultdict
import csv
from csv import reader


# Question's original attempt, kept verbatim as the non-working exhibit.
# Intent: build an {original: replacement} table from table.csv, then rewrite
# the class name of every <object> in each XML file under ./xmls.
# The NOTE(review) comments below mark the places where it fails.
with open('table.csv', mode='r') as inp:
    # NOTE(review): this rebinds `reader`, shadowing the name imported by
    # `from csv import reader` above.
    reader = csv.reader(inp)
    # NOTE(review): rows[2] is the THIRD cell of each row. The post describes
    # a two-column CSV (original, change to); if that is accurate this raises
    # IndexError and rows[1] was presumably intended. Every row is used, so a
    # header row (if the file has one) also ends up in the dict.
    dict_from_csv = {rows[0]:rows[2] for rows in reader}

#print(dict_from_csv)

root_path = "./xmls"

# Plain file names (str) of everything inside root_path, in sorted order.
xml_list = sorted(os.listdir(root_path))

for xml_file in xml_list:
    xml_path = os.path.join(root_path,xml_file)
    # parse xml file
    tree = ET.parse(xml_path)
    # get root node
    root = tree.getroot()
    for member in root.findall('object'):
        # member[0] is the first child of <object>; in the sample annotation
        # shown in the post that child is <name>.
        sub_child = member[0].text
        print(sub_child)
    # NOTE(review): this loop sits OUTSIDE the `for member` loop, so
    # `sub_child` only holds the text of the last <object> (and is undefined
    # for a file that has no <object> at all).
    for key, value in dict_from_csv.items():
        # NOTE(review): `in` between two strings is a substring test, not an
        # equality test, so "AA" would also match the key "AAAA".
        if sub_child in key:
            # NOTE(review): this only rebinds the local string; the element
            # in `tree` is never assigned to, so the parsed XML is unchanged.
            sub_child = sub_child.replace(sub_child, value)
            #print(xml)
        # NOTE(review): `xml_file` is a str coming from os.listdir, which has
        # no .write() method -> AttributeError. These two statements also run
        # once per CSV entry because they are inside the `for key` loop.
        xml_file.write(sub_child)  
        print("Classes are changed : " + xml_path)

任何帮助将不胜感激。

谢谢

2 个答案:

答案 0 :(得分:1)

以下代码应该可以满足您的要求:

import lxml.html   # check https://pypi.org/project/lxml/
from csv import reader
from os.path import exists
import glob


# Conversion tables already loaded, keyed by CSV path. With tens of thousands
# of XML files this avoids re-reading and re-parsing the CSV for every file.
# NOTE: a CSV edited while the process is running is not re-read.
_MAPPING_CACHE = {}


def _load_mapping(csv_path: str) -> dict:
    """Return the ``{original: change_to}`` table stored in *csv_path*.

    The first row is treated as a header and skipped. Completely blank rows
    are ignored. Only the first two columns are used, and surrounding
    whitespace is stripped from both. If an ``original`` value is listed more
    than once, the first occurrence wins.

    Raises:
        FileNotFoundError: if *csv_path* does not exist.
        ValueError: if a non-blank row has fewer than two columns.
    """
    import csv

    cached = _MAPPING_CACHE.get(csv_path)
    if cached is not None:
        return cached

    mapping = {}
    # newline='' is what the csv module asks for so quoted line breaks survive.
    with open(csv_path, 'r', newline='') as conversions:
        rows = csv.reader(conversions)
        next(rows, None)  # skip the header row
        for line_no, row in enumerate(rows, start=2):
            if not any(cell.strip() for cell in row):
                continue
            if len(row) < 2:
                raise ValueError(
                    f'{csv_path}, row {line_no}: expected at least two '
                    f'columns (original, change to), got {row!r}'
                )
            mapping.setdefault(row[0].strip(), row[1].strip())

    _MAPPING_CACHE[csv_path] = mapping
    return mapping


def update_xml(path: str, csv_path: str = './convertions.csv') -> None:
    """Rename the ``<name>`` values of one XML annotation file in place.

    Every ``<name>`` element whose (whitespace-stripped) text appears in the
    ``original`` column of the CSV gets the matching ``change to`` value.
    Each element is looked up exactly once, so a replacement value is never
    itself replaced by a later CSV row.

    The file is parsed with the standard-library XML parser rather than an
    HTML parser, so element names and nesting are preserved as written. It is
    only rewritten (as UTF-8) when at least one name actually changed.

    Args:
        path: XML annotation file to update.
        csv_path: CSV with a header row followed by ``original,change to``
            rows.

    Raises:
        FileNotFoundError: if *path* or *csv_path* does not exist.
        ValueError: if the CSV contains a row with fewer than two columns.
        xml.etree.ElementTree.ParseError: if *path* is not well-formed XML.
    """
    import xml.etree.ElementTree as ET

    mapping = _load_mapping(csv_path)
    tree = ET.parse(path)

    changed = False
    for tag in tree.iter('name'):
        original = (tag.text or '').strip()
        if not original:
            continue

        change_to = mapping.get(original)
        if change_to is None or change_to == original:
            continue

        tag.text = change_to
        changed = True
        print(f'Changed class {original} to {change_to} in {path}')

    # Leave untouched files alone: nothing to save, and nothing to lose if
    # the process is interrupted while writing.
    if changed:
        tree.write(path, encoding='utf-8')

    print(f'Processing on {path} done')


if __name__ == '__main__':
    # Update every XML annotation found in the current working directory.
    for candidate in glob.glob('*.xml'):
        # Skip entries that are gone (or dangling) by the time we reach them.
        if not exists(candidate):
            continue
        update_xml(path=candidate)


annotation.xml:

<annotation>
<folder>./test_xmls</folder>
<filename>000048_Panorama.jpg</filename>
<path>./images000048_Panorama.jpg</path>
<source>
<database>Unknown</database>
</source>
<size>
<width>4000</width>
<height>2000</height>
<depth>3</depth>
</size>
<segmented>0</segmented>
<object>
<name>AAAA</name>
<name>BBBB</name>
<name>CCCC</name>
<pose>Unspecified</pose>
<truncated>0</truncated>
<difficult>0</difficult>
<bndbox></bndbox></object></annotation>

convertions.csv(文件名须与上面代码中打开的 './convertions.csv' 保持一致):

original,change to
AAAA,class_A
BBBB,class_B
CCCC,class_C

答案 1 :(得分:0)

import lxml.html  # check https://pypi.org/project/lxml/
from csv import reader
import xml.etree.ElementTree as ET

def main(csv_path: str = './table.csv', root_path: str = './xmls') -> None:
    """Rename ``<name>`` values in every XML file under *root_path*.

    The CSV is read once into an ``{original: change_to}`` table (first row
    is a header and is skipped, blank rows are ignored, first occurrence of a
    duplicated ``original`` wins). Each ``*.xml`` file in *root_path* is then
    parsed, every ``<name>`` element whose stripped text is in the table is
    rewritten, and the file is saved back in place (as UTF-8) only when
    something changed.

    Args:
        csv_path: CSV with a header row followed by ``original,change to``
            rows; only the first two columns are used.
        root_path: directory containing the XML annotation files.

    Raises:
        FileNotFoundError: if *csv_path* or *root_path* does not exist.
        ValueError: if a non-blank CSV row has fewer than two columns.
        xml.etree.ElementTree.ParseError: if an XML file is not well-formed.
    """
    import os  # not part of this snippet's top-level imports

    mapping = {}
    with open(csv_path, 'r', newline='') as convertions:
        csv_reader = reader(convertions)
        next(csv_reader, None)  # skip the header row
        for line_no, row in enumerate(csv_reader, start=2):
            if not any(cell.strip() for cell in row):
                continue
            if len(row) < 2:
                raise ValueError(
                    f'{csv_path}, row {line_no}: expected at least two '
                    f'columns (original, change to), got {row!r}'
                )
            mapping.setdefault(row[0].strip(), row[1].strip())

    for xml_file in sorted(os.listdir(root_path)):
        # The directory may hold other files (images, notes); only parse XML.
        if not xml_file.lower().endswith('.xml'):
            continue

        xml_path = os.path.join(root_path, xml_file)
        tree = ET.parse(xml_path)

        changed = False
        for tag in tree.iter('name'):
            original_col = (tag.text or '').strip()
            if not original_col:
                continue

            change_to = mapping.get(original_col)
            if change_to is None or change_to == original_col:
                continue

            tag.text = change_to
            changed = True
            print(f'Changed class {original_col} to {change_to}')

        # Write back to the file that was parsed, and only when needed.
        if changed:
            tree.write(xml_path, encoding='utf-8')


if __name__ == '__main__':
    main()