合并XML文件并删除重复的行

时间:2015-09-15 22:53:56

标签: python xml linux

我使用以下脚本合并XML文件。有5种不同的XML文件类型,ItemAvailability.xml,ItemUpsert.xml,ItemCatDesc.xml,ItemPrice.xml和ItemDelete.xml。如何使用相同的脚本从组合的XML文件中删除重复的行?

我真的只需要从ItemAvailability.xml文件中删除重复项,该脚本以15分钟的间隔cron作业运行。任何想法都会有所帮助。

当我只运行ItemAvailability.xml时,使用@Sahil M编辑的脚本可以工作,但是当我同时运行所有XML文件时,我收到以下错误:

Traceback (most recent call last):
  File "/var/www/script/mergeXML.py", line 24, in <module>
    run(sys.argv[1:])
  File "/var/www/script/mergeXML.py", line 17, in run
    if ( (row[0].text, row[1].text) ) in withoutDuplicates:
  File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 266, in __getitem__
    return self._children[index]
IndexError: list index out of range

@Sahil M更新了剧本

#!/usr/bin/env python
import sys
from xml.etree import ElementTree

def run(files):
    first = None
    for filename in files:
        data = ElementTree.parse(filename).getroot()
        if first is None:
            first = data
        else:
            first.extend(data)
    # Creating without duplicates here, which contains the unique list of elements determined by values of subelements
    withoutDuplicates = []
    if first is not None:
        for row in first.findall('row'):
            if ( (row[0].text, row[1].text) ) in withoutDuplicates:
                first.remove(row) # Removing duplicate row
            else:
                withoutDuplicates.append((row[0].text, row[1].text))
        print ElementTree.tostring(first)

if __name__ == "__main__":
    run(sys.argv[1:])

原始剧本

#!/usr/bin/env python
import sys
from xml.etree import ElementTree

def run(files):
    first = None
    for filename in files:
        data = ElementTree.parse(filename).getroot()
        if first is None:
            first = data
        else:
            first.extend(data)
    if first is not None:
        print ElementTree.tostring(first)

if __name__ == "__main__":
    run(sys.argv[1:])

要合并文件,请运行:

python xmlcombine.py ?.xml > combined.xml

脚本来源:Merge multiple XML files from command line

示例XML文件。

ItemAvailability.xml

<FF_IT_AP_P_S table="XCHLIB.ITAP00000">
    <row>
        <APITEM type="1" typename="char">AK-S30         </APITEM>
        <APQAVL type="2" typename="numeric">20</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90500H         </APITEM>
        <APQAVL type="2" typename="numeric">2159</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">AP60218        </APITEM>
        <APQAVL type="2" typename="numeric">68</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW200P    </APITEM>
        <APQAVL type="2" typename="numeric">23</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW201P    </APITEM>
        <APQAVL type="2" typename="numeric">6</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGOMNIHA300  </APITEM>
        <APQAVL type="2" typename="numeric">5</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">AP60218        </APITEM>
        <APQAVL type="2" typename="numeric">68</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90451R         </APITEM>
        <APQAVL type="2" typename="numeric">392</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90500H         </APITEM>
        <APQAVL type="2" typename="numeric">2159</APQAVL>
    </row>
</FF_IT_AP_P_S>

ItemUpsert.xml

<AS_Items table="XCHLIB.ITUP018495">
    <row>
        <UPITEM type="1" typename="char">03-020-07128   </UPITEM>
        <UPMFGP type="1" typename="char">03-020-07128   </UPMFGP>
        <UPVNDR type="1" typename="char">DANIEL DEFENSE                </UPVNDR>
        <UPUPC type="1" typename="char">815604016780   </UPUPC>
        <UPDSCR type="1" typename="char">DAN KEYMD OFFSET FLSHLGHT MT  </UPDSCR>
        <UPRTPR type="2" typename="numeric">55.00</UPRTPR>
        <UPHZCD type="1" typename="char">0</UPHZCD>
        <UPUOM type="1" typename="char">EA </UPUOM>
        <UPWGHT type="2" typename="numeric">0.5000</UPWGHT>
        <CONSUMER_DROP_SHIP type="12" typename="varchar">Y</CONSUMER_DROP_SHIP>
    </row>
    <row>
        <UPITEM type="1" typename="char">01-005-10001   </UPITEM>
        <UPMFGP type="1" typename="char">01-005-10001   </UPMFGP>
        <UPVNDR type="1" typename="char">DANIEL DEFENSE                </UPVNDR>
        <UPUPC type="1" typename="char">852548002028   </UPUPC>
        <UPDSCR type="1" typename="char">DAN OMEGA RAIL 7.0 CARBINE    </UPDSCR>
        <UPRTPR type="2" typename="numeric">229.00</UPRTPR>
        <UPHZCD type="1" typename="char">0</UPHZCD>
        <UPUOM type="1" typename="char">EA </UPUOM>
        <UPWGHT type="2" typename="numeric">1.0000</UPWGHT>
        <CONSUMER_DROP_SHIP type="12" typename="varchar">Y</CONSUMER_DROP_SHIP>
    </row>
    <row>
        <UPITEM type="1" typename="char">01-147-02038   </UPITEM>
        <UPMFGP type="1" typename="char">01-147-02038   </UPMFGP>
        <UPVNDR type="1" typename="char">DANIEL DEFENSE                </UPVNDR>
        <UPUPC type="1" typename="char">815604016704   </UPUPC>
        <UPDSCR type="1" typename="char">DAN SLIM RAIL 12.0 RIFLE      </UPDSCR>
        <UPRTPR type="2" typename="numeric">299.00</UPRTPR>
        <UPHZCD type="1" typename="char">0</UPHZCD>
        <UPUOM type="1" typename="char">EA </UPUOM>
        <UPWGHT type="2" typename="numeric">1.4000</UPWGHT>
        <CONSUMER_DROP_SHIP type="12" typename="varchar">Y</CONSUMER_DROP_SHIP>
    </row>
    <row>
        <UPITEM type="1" typename="char">01-147-22026   </UPITEM>
        <UPMFGP type="1" typename="char">01-147-22026   </UPMFGP>
        <UPVNDR type="1" typename="char">DANIEL DEFENSE                </UPVNDR>
        <UPUPC type="1" typename="char">815604016698   </UPUPC>
        <UPDSCR type="1" typename="char">DAN SLIM RAIL 15.0 RIFLE      </UPDSCR>
        <UPRTPR type="2" typename="numeric">299.00</UPRTPR>
        <UPHZCD type="1" typename="char">0</UPHZCD>
        <UPUOM type="1" typename="char">EA </UPUOM>
        <UPWGHT type="2" typename="numeric">1.5500</UPWGHT>
        <CONSUMER_DROP_SHIP type="12" typename="varchar">Y</CONSUMER_DROP_SHIP>
    </row>
</AS_Items>

ItemCatDesc.xml

<STEP-ProductInformation>
    <Products>
        <Product AnalyzerResult="included" ID="094168" ParentID="MM11083" UserTypeID="Item">
            <Name>5659R</Name>
            <ClassificationReference AnalyzerResult="" ClassificationID="" InheritedFrom="" />
            <Values>
                <Value AttributeID="Retail Price">608.69</Value>
                <Value AttributeID="Item Class">02017</Value>
                <Value AttributeID="Long Gun Web Model">Model 870</Value>
                <Value AttributeID="Item Group">02</Value>
                <Value AttributeID="Barrel Length">21 VR/RC:20 RS/FR</Value>
                <Value AttributeID="Consumer Description">Model 870 Express Compact Combo 20 Gauge 21 Inch Vent Rib Barrel and 20 Inch Fully Rifled Deer Barrel Synthetic Stock Matte Black Finish 4 Round</Value>
                <Value AttributeID="Gauge">20</Value>
                <Value AttributeID="Made In The USA">Made In The USA</Value>
                <Value AttributeID="Item Code">5659R</Value>
                <Value AttributeID="Catalog Vendor Name">REMINGTON</Value>
                <Value AttributeID="Capacity">4</Value>
                <Value AttributeID="Chamber Inch">3</Value>
                <Value AttributeID="Item Status">OPEN</Value>
                <Value AttributeID="Package Guns">Package Guns</Value>
                <Value AttributeID="Wildcat Eligible">N</Value>
                <Value AttributeID="Item Description">REM 870 EXP CPT COMBO 20 21/20</Value>
                <Value AttributeID="Primary Vendor">253LG</Value>
                <Value AttributeID="Caliber-Gauge">20 GA.</Value>
                <Value AttributeID="InventoryTyp">REG</Value>
                <Value AttributeID="ProductPageNumber">&lt;NoVersionPageNo/&gt;</Value>
                <Value AttributeID="Master Model Body Copy">Black synthetic stock. Black matte metal finish - non-reflective. Twenty one inch vent rib barrel with RemChoke and single bead sights and a twenty inch fully rifled deer barrel with rifled sights. Four round magazine capacity.</Value>
                <Value AttributeID="Master Model Header">Model 870 Express Compact Synthetic Combo</Value>
                <Value AttributeID="Vendor Group">REM</Value>
            </Values>
            <AssetCrossReference AnalyzerResult="included" AssetID="rem_5659r" Type="Primary Image" />
        </Product>
        <Product AnalyzerResult="included" ID="135950" ParentID="MM202159" UserTypeID="Item">
            <Name>85118</Name>
            <ClassificationReference AnalyzerResult="" ClassificationID="" InheritedFrom="" />
            <Values>
                <Value AttributeID="Retail Price">776.00</Value>
                <Value AttributeID="Item Class">02022</Value>
                <Value AttributeID="Long Gun Web Model">Model 930</Value>
                <Value AttributeID="Item Group">02</Value>
                <Value AttributeID="Barrel Length">24</Value>
                <Value AttributeID="Consumer Description">Model 930 Jerry Miculek JM Pro Series 12 Gauge 3 Inch Chamber 24 Inch Vent Rib Barrel Black Synthetic Stock 9 Round</Value>
                <Value AttributeID="Gauge">12</Value>
                <Value AttributeID="Made In The USA">Made In The USA</Value>
                <Value AttributeID="Item Code">85118</Value>
                <Value AttributeID="Catalog Vendor Name">MOSSBERG</Value>
                <Value AttributeID="Choke">F, M, IC</Value>
                <Value AttributeID="Capacity">9</Value>
                <Value AttributeID="Stock">Black Synthetic</Value>
                <Value AttributeID="Chamber Inch">3</Value>
                <Value AttributeID="Finish">Matte Blue</Value>
                <Value AttributeID="Item Status">OPEN</Value>
                <Value AttributeID="Wildcat Eligible">N</Value>
                <Value AttributeID="Item Description">MOS 930 JM 12 3 24VR SYN 9</Value>
                <Value AttributeID="Primary Vendor">215RT</Value>
                <Value AttributeID="Caliber-Gauge">12 GA.</Value>
                <Value AttributeID="InventoryTyp">REG</Value>
                <Value AttributeID="ProductPageNumber">&lt;NoVersionPageNo/&gt;</Value>
                <Value AttributeID="Master Model Body Copy">Competition level engineering. Features a beveled loading gate and shortened forend for ease of loading and an added barrel clamp for extended mag tube stability. Dual gas system for recoil reduction. Accu-set choke tubes includes Full, Modified and Improved Cylinder. Engraved receiver. Knurled charging handle for retracting the bolt. Quick-empy magazine button for unloading. Matte blue vent-rib barrel. Black synthetic stock and forend with checkering. Fiber optic front sight. Three inch chamber. Gun lock included.</Value>
                <Value AttributeID="Master Model Header">Model 930 Jerry Miculek JM Pro Series</Value>
                <Value AttributeID="Vendor Group">MOS</Value>
            </Values>
            <AssetCrossReference AnalyzerResult="included" AssetID="MOS_85118" Type="Primary Image" />
        </Product>
        <Product AnalyzerResult="included" ID="157866" ParentID="MM241817" UserTypeID="Item">
            <Name>JRTF614</Name>
            <ClassificationReference AnalyzerResult="" ClassificationID="" InheritedFrom="" />
            <Values>
                <Value AttributeID="Retail Price">950.00</Value>
                <Value AttributeID="Item Class">02046</Value>
                <Value AttributeID="Item Group">02</Value>
                <Value AttributeID="Barrel Length">22</Value>
                <Value AttributeID="Consumer Description">Tikka T3 Forest .22-250 Remington 22 Inch Barrel Blue Finish No Sights Walnut Stock 3 Round</Value>
                <Value AttributeID="Item Code">JRTF614</Value>
                <Value AttributeID="Caliber">.22-250 Rem.</Value>
                <Value AttributeID="Catalog Vendor Name">BERETTA</Value>
                <Value AttributeID="Capacity">3</Value>
                <Value AttributeID="Finish">Blue</Value>
                <Value AttributeID="Item Status">OPEN</Value>
                <Value AttributeID="Wildcat Eligible">N</Value>
                <Value AttributeID="Item Description">BER T3 FORST 22250 22 BL WLN 3</Value>
                <Value AttributeID="Primary Vendor">120TK</Value>
                <Value AttributeID="Caliber-Gauge">.22-250 REM.</Value>
                <Value AttributeID="InventoryTyp">REG</Value>
                <Value AttributeID="ProductPageNumber">&lt;NoVersionPageNo/&gt;</Value>
                <Value AttributeID="Master Model Body Copy">Features a roll-over cheek piece on the checkered walnut stock. Adjustable single stage trigger. Two-way linear safety. Sling swivel studs. No sights. Black rubber recoil pad. Cold hammer forged, free-floating barrel with recessed and precision crowned muzzle. Single column detachable magazine with flush release.</Value>
                <Value AttributeID="Master Model Header">Tikka T3 Forest</Value>
                <Value AttributeID="Vendor Group">BER</Value>
            </Values>
            <AssetCrossReference AnalyzerResult="included" AssetID="BER_JRTF615" Type="Primary Image" />
        </Product>
</STEP-ProductInformation>

ItemPrice.xml

<ICA_00023535_PR_P_S table="XCHLIB.ITPR00000">
    <row>
        <PRITEM type="1" typename="char">03-020-07128   </PRITEM>
        <PRPRICE type="2" typename="numeric">41.77</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-005-10001   </PRITEM>
        <PRPRICE type="2" typename="numeric">178.48</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-147-02038   </PRITEM>
        <PRPRICE type="2" typename="numeric">228.97</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-147-22026   </PRITEM>
        <PRPRICE type="2" typename="numeric">228.97</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">03-045-18025   </PRITEM>
        <PRPRICE type="2" typename="numeric">65.50</PRPRICE>
    </row>
</ICA_00023535_PR_P_S>

ItemDelete.xml

<FF_IT_DL_P_S table="XCHLIB.ITDL00000">
    <row>
        <DLITEM type="1" typename="char">S3H5V242412PHC </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX82424411    </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX81224411    </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">5-93801        </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX81224411    </DLITEM>
    </row>
</FF_IT_DL_P_S>

对于这里的混乱感到抱歉,我并没有尝试将所有不同的文件类型合并为1.以下是我设置所有内容的方式。

在15分钟的时间间隔内,我自动下载一批文件,每种类型通常有3或4个文件所以我最终会得到这样的结果:

ItemAvailability-111.xml,ItemAvailability-222.xml,ItemAvailability-333.xml ItemUpsert-111.xml,ItemUpsert-222.xml,ItemUpsert-333.xml ItemCatDesc-111.xml,ItemCatDesc-222.xml,ItemCatDesc-333.xml ItemPrice-111.xml,ItemPrice-222.xml,ItemPrice-333.xml ItemDelete-111.xml,ItemDelete-222.xml

然后我的主脚本对这些文件进行排序并在每种文件类型上运行合并脚本,所以当它完成后我有5个文件(ItemAvailability-combined.xml,ItemUpsert-combined.xml,ItemCatDesc-combined.xml ,ItemPrice-combined.xml和ItemDelete-combined.xml)导入我的商店。

1 个答案:

答案 0 :(得分:1)

以下内容从合并文件中删除重复的行。使用每行的子元素提取重复的行。

#!/usr/bin/env python
import sys
from xml.etree import ElementTree

def run(files):
    first = None
    for filename in files:
        data = ElementTree.parse(filename).getroot()
        if first is None:
            first = data
        else:
            first.extend(data)
    # Creating without duplicates here, which contains the unique list of elements determined by values of subelements
    withoutDuplicates = []
    if first is not None:
        for row in first.findall('row'):
            if ( (row[0].text, row[1].text) ) in withoutDuplicates:
                first.remove(row) # Removing duplicate row
            else:
                withoutDuplicates.append((row[0].text, row[1].text))
        print ElementTree.tostring(first)

if __name__ == "__main__":
    run(sys.argv[1:])

将文件与自身合并以演示来自其他文件的重复条目:

python mergeXML.py sample.xml sample.xml

结果:

<FF_IT_AP_P_S table="XCHLIB.ITAP018495">
    <row>
        <APITEM type="1" typename="char">AK-S30         </APITEM>
        <APQAVL type="2" typename="numeric">20</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90500H         </APITEM>
        <APQAVL type="2" typename="numeric">2159</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">AP60218        </APITEM>
        <APQAVL type="2" typename="numeric">68</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW200P    </APITEM>
        <APQAVL type="2" typename="numeric">23</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW201P    </APITEM>
        <APQAVL type="2" typename="numeric">6</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGOMNIHA300  </APITEM>
        <APQAVL type="2" typename="numeric">5</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90451R         </APITEM>
        <APQAVL type="2" typename="numeric">392</APQAVL>
    </row>
    </FF_IT_AP_P_S>

根据问题中的更新,只需将设置比较更改为单个元素比较。所以你的for循环应该是:

    for row in first.findall('row'):
        if ( row[0].text) in withoutDuplicates:
            first.remove(row) # Removing duplicate row
        else:
            withoutDuplicates.append(row[0].text)

当你可以运行

python mergeXML.py sample.xml itemDelete.xml itemPrice.xml > mergedFile.xml

mergedFile.xml是:

<FF_IT_AP_P_S table="XCHLIB.ITAP00000">
    <row>
        <APITEM type="1" typename="char">AK-S30         </APITEM>
        <APQAVL type="2" typename="numeric">20</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90500H         </APITEM>
        <APQAVL type="2" typename="numeric">2159</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">AP60218        </APITEM>
        <APQAVL type="2" typename="numeric">68</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW200P    </APITEM>
        <APQAVL type="2" typename="numeric">23</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGLOW201P    </APITEM>
        <APQAVL type="2" typename="numeric">6</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">ATIGOMNIHA300  </APITEM>
        <APQAVL type="2" typename="numeric">5</APQAVL>
    </row>
    <row>
        <APITEM type="1" typename="char">90451R         </APITEM>
        <APQAVL type="2" typename="numeric">392</APQAVL>
    </row>
    <row>
        <DLITEM type="1" typename="char">S3H5V242412PHC </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX82424411    </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">APX81224411    </DLITEM>
    </row>
    <row>
        <DLITEM type="1" typename="char">5-93801        </DLITEM>
    </row>
    <row>
        <PRITEM type="1" typename="char">03-020-07128   </PRITEM>
        <PRPRICE type="2" typename="numeric">41.77</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-005-10001   </PRITEM>
        <PRPRICE type="2" typename="numeric">178.48</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-147-02038   </PRITEM>
        <PRPRICE type="2" typename="numeric">228.97</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">01-147-22026   </PRITEM>
        <PRPRICE type="2" typename="numeric">228.97</PRPRICE>
    </row>
    <row>
        <PRITEM type="1" typename="char">03-045-18025   </PRITEM>
        <PRPRICE type="2" typename="numeric">65.50</PRPRICE>
    </row>
</FF_IT_AP_P_S>

您可以拥有包含完整标签的任意数量的文件。