使用Python解析XML属性:按'和"字符拆分

时间:2017-10-05 16:10:12

标签: python xml

我正在使用NVD XML,并尝试解析和拆分XML,以便最终导入数据库。我遇到的问题是:解析出的XML attrib中,值有时被"包围,有时被'包围,导致我无法拆分这些字符串。我已经附上了代码和当前失败的条目。预期的输出是:

product,america's_first_federal_credit_union,america's_first_fcu_mobile_banking

代码

#!/usr/bin/env python
import os
import sys
import time
from subprocess import call
import xml.etree.ElementTree
import re

# Inclusive range of feed years processed by the download/parse loop below.
range_from = 2017
range_to = 2017

def process_entry(entry):
    """Print the entry's "name" attribute, then process its affected products.

    ``entry`` must expose an ``attrib`` mapping; in this script it is an
    element taken from the parsed feed's root.
    """
    cve = entry.attrib.get("name")
    print cve
    # The result is stored but not used anywhere else in this function.
    cpes = get_cpes_affected(entry)


def get_cpes_affected(entry):
    """Build and print "product,..." / "version,..." lines for an entry.

    Walks every element under ``entry`` and, for tags containing "}prod" or
    "}vers", extracts values by splitting the *text representation* of the
    element's attribute dict on quote characters. Nothing is returned; the
    collected lines are only printed.

    Raises:
        IndexError: when the split yields fewer pieces than the hard-coded
            indexes expect. This happens when an attribute value contains an
            apostrophe, because the dict repr then wraps that value in double
            quotes instead of single quotes.
    """
    child = []
    for e in entry.iter():
        if "}prod" in e.tag:
            # Debug output: the attribute dict and a literal double quote.
            print e.attrib
            print unichr(34)
            # e.attrib is a dict, so `in` tests its KEYS, not the repr text.
            # NOTE(review): with keys like 'vendor'/'name' this looks like it
            # can never be true, so the else branch is always taken - confirm.
            if unichr(34) in e.attrib:
                print "hey yo"
                child.append("product," + str(e.attrib).split('"')[1] + "," + str(e.attrib).split('"')[3])
            else:
                # Assumes every key and value in the repr is single-quoted;
                # indexes 3 and 7 pick the first and second values.
                child.append("product," + str(e.attrib).split("'")[3] + "," + str(e.attrib).split("'")[7])
            #print e.tag, e.attrib
        # Looks for a key literally spelled 'prev' *including* the quotes.
        # NOTE(review): presumably meant to test for the key "prev" - confirm.
        if "'prev'" in e.attrib:
            child.append("version," + str(e.attrib).split("'")[7] + "," + str(e.attrib).split("'")[3])
        if "}vers" in e.tag and "'prev'" not in e.attrib:
            child.append("version," + str(e.attrib).split("'")[3] + ",")
            #print e.tag, e.attrib
    for derp in child:
        print derp

# For each year in the inclusive range: download that year's feed archive,
# unpack it, parse the extracted XML and process every child of the root.
for i in range(range_from, range_to+1):
    # Shells out to wget/unzip; their exit statuses are not checked.
    os.system("wget -O tmp.zip https://nvd.nist.gov/download/nvdcve-%i.xml.zip" % i)
    os.system("unzip -o tmp.zip")
    e = xml.etree.ElementTree.parse('nvdcve-%i.xml' % i).getroot()

    for entry in e:
        process_entry(entry)

正在解析的XML条目示例

    <entry type="CVE" name="CVE-2017-5916" seq="2017-5916" published="2017-05-05" modified="2017-05-16" severity="Medium" CVSS_version="2.0" CVSS_score="4.3" CVSS_base_score="4.3" CVSS_impact_subscore="2.9" CVSS_exploit_subscore="8.6" CVSS_vector="(AV:N/AC:M/Au:N/C:P/I:N/A:N)">
<desc>
  <descript source="cve">The America's First Federal Credit Union (FCU) Mobile Banking app 3.1.0 for iOS does not verify X.509 certificates from SSL servers, which allows man-in-the-middle attackers to spoof servers and obtain sensitive information via a crafted certificate.</descript>
</desc>
<loss_types>
  <conf/>
</loss_types>
<range>
  <network/>
</range>
<refs>
  <ref source="MISC" url="https://medium.com/@chronic_9612/follow-up-76-popular-apps-confirmed-vulnerable-to-silent-interception-of-tls-protected-data-64185035029f" adv="1">https://medium.com/@chronic_9612/follow-up-76-popular-apps-confirmed-vulnerable-to-silent-interception-of-tls-protected-data-64185035029f</ref>
</refs>
<vuln_soft>
  <prod name="america's_first_fcu_mobile_banking" vendor="america's_first_federal_credit_union">
    <vers num="3.1.0" prev="1" edition=":~~~iphone_os~~"/>
  </prod>
</vuln_soft>

输入

失败
{'vendor': "america's_first_federal_credit_union", 'name': "america's_first_fcu_mobile_banking"}

下面再附上一个能够正常拆分、没有问题的字符串示例:

{'vendor': 'emirates_nbd_bank_p.j.s.c', 'name': 'emirates_nbd_ksa'}

抱歉,忘了附上错误信息:

Traceback (most recent call last):
  File "prev-version-load.py", line 49, in <module>
    process_entry(entry)
  File "prev-version-load.py", line 18, in process_entry
    cpes = get_cpes_affected(entry)
  File "prev-version-load.py", line 33, in get_cpes_affected
    child.append("product," + str(e.attrib).split("'")[3] + "," + str(e.attrib).split("'")[7])
IndexError: list index out of range

2 个答案:

答案 0 :(得分:0)

考虑替换......

# Quote-splitting logic quoted from the question's get_cpes_affected(); it
# runs inside the loop over entry.iter(), with `child` as the output list.
if "}prod" in e.tag:
    print unichr(34)
    # e.attrib is a dict, so this membership test checks keys, not repr text.
    if unichr(34) in e.attrib:
        print "hey yo"
        child.append("product," + str(e.attrib).split('"')[1] + "," + str(e.attrib).split('"')[3])
    else:
        child.append("product," + str(e.attrib).split("'")[3] + "," + str(e.attrib).split("'")[7])
    #print e.tag, e.attrib
if "'prev'" in e.attrib:
    child.append("version," + str(e.attrib).split("'")[7] + "," + str(e.attrib).split("'")[3])
if "}vers" in e.tag and "'prev'" not in e.attrib:
    child.append("version," + str(e.attrib).split("'")[3] + ",")

使用...

# Delimiter pattern: every double quote, plus a single quote only when it is
# followed by another single quote (with no double quote in between) or by
# non-word characters and then a double quote. The intent is to split the
# dict repr on its string delimiters while leaving an apostrophe inside a
# double-quoted value untouched.
reg = r"\"|'(?=[^\"]*')|'(?=\W*\")"

# Split the attribute dict's text form once and reuse the pieces below.
pieces = re.split(reg, str(e.attrib))

if "prod" in e.tag:
    child.append("product," + pieces[3] + "," + pieces[7])
if "prev" in e.attrib:
    child.append("version," + pieces[7] + "," + pieces[3])
if "vers" in e.tag and "prev" not in e.attrib:
    child.append("version," + pieces[3] + ",")

如果有效,请告诉我,我会解释。

**更新**

更好的解决方案如下: -

    # Read values straight from the e.attrib dict instead of splitting its
    # text representation, so quote characters inside values are irrelevant.
    if "prod" in e.tag:
        # Vendor first, then product name: this is the order of the expected
        # output line "product,<vendor>,<name>".
        child.append("product," + e.attrib['vendor'] + "," + e.attrib['name'])
    if "prev" in e.attrib:
        # NOTE(review): prev-before-num order is kept as posted - confirm it
        # matches the desired "version,..." layout.
        child.append("version," + e.attrib['prev'] + "," + e.attrib['num'])
    if "vers" in e.tag and "prev" not in e.attrib:
        child.append("version," + e.attrib['num'] + ",")

我的原始解决方案和更新后的解决方案都覆盖了您的全部三种情况;针对您给出的xml的可运行示例见here

答案 1 :(得分:0)

这与解析xml无关,而是与格式化输出的方式有关。

在shell脚本中,大多数东西都只是字符串,只需做些字符串处理就能得到想要的输出;Python则不同,它是面向对象的语言,Python中的对象都有类型。具体来说,e.attrib是字典类型,不能对字典进行字符串操作。

我建议改用ElementTree的findall()方法来实现我认为你想做的事情。例如,我认为这才是你真正想要的:

#!/usr/bin/env python
from xml.etree import ElementTree as ET

# Inclusive range of feed years; each year maps to a local nvdcve-<year>.xml.
range_from = 2017
range_to = 2017

def process_entry(entry):
    """Print the entry's "name" attribute, then print its affected products.

    ``entry`` must expose an ``attrib`` mapping (an ElementTree element here).
    Product/version output is delegated to get_cpes_affected().
    """
    cve_name = entry.attrib.get("name")
    print(cve_name)
    get_cpes_affected(entry)


def get_cpes_affected(entry):
    """Print product and version lines for every product under ``entry``.

    Products are located with the path 'nvd:vuln_soft/nvd:prod', resolved
    through the module-level ``namespaces`` map. Output happens in two passes:
    first each product's raw attribute dict followed by a double-quote line,
    then a "product,<vendor>,<name>" line per product with one "version,...,"
    line for each of its 'nvd:vers' children. Returns None.
    """
    products = entry.findall('nvd:vuln_soft/nvd:prod', namespaces=namespaces)

    # Pass 1: dump the attribute dicts.
    for product in products:
        print(product.attrib)
        print('"')

    # Pass 2: formatted product/version lines.
    for product in products:
        attrs = product.attrib
        print("product,{},{}".format(attrs['vendor'], attrs['name']))
        for version in product.findall('nvd:vers', namespaces=namespaces):
            # Choose which attribute to report: a non-empty 'edition' wins,
            # then 'prev' when it equals '1', otherwise 'num'.
            if version.get('edition'):
                field = 'edition'
            elif version.get('prev') == '1':
                field = 'prev'
            else:
                field = 'num'
            print("version,{},".format(version.attrib[field]))


# Prefix -> URI map passed to the findall() calls in get_cpes_affected().
namespaces = {'nvd': 'http://nvd.nist.gov/feeds/cve/1.2'}
# OPTIONAL: registering namespace is useful for outputting XML with ET.tostring()/ET.dump()
#for prefix, ns in namespaces.items():
#    ET.register_namespace(prefix, ns)

# Parse each year's local feed file and process every child of its root.
for year in range(range_from, range_to + 1):
    feed_path = 'nvdcve-%i.xml' % year
    root = ET.parse(feed_path).getroot()
    for entry in root:
        process_entry(entry)