我有从输入XML文件开始的这段代码,
该代码工作正常,但不会删除其值包含字符串“ 10.238”的键。下面是完整的代码:
import re
from xml.dom import minidom
from xml.etree import ElementTree as ET
def filter_values_by_keyword(my_dict, filter_by):
"""
Return a list of values which contains `filter_by` keyword.
Arguments:
my_dict (dict): Dict containing (...data specifics here)
filter_by (str): Keyword to look for in values of my_dict
Return:
List of filtered values
"""
return [key for key, value in my_dict.items() if filter_by in value]
def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2):
"""
Your docstring here.
"""
data = {}
xml_tree = minidom.parse(xml_path)
item_group_nodes = xml_tree.getElementsByTagName(tag_name_1)
for idx, item_group_node in enumerate(item_group_nodes):
cl_compile_nodes = item_group_node.getElementsByTagName(tag_name_2)
for _ in cl_compile_nodes:
data[idx]=[item_group_node.toxml()]
return data
def main():
data = get_xml_by_tag_names('output2.xml', 'new_line', 'text')
filtered_values = filter_values_by_keyword(data, '10.238')
for item in filtered_values:
del data[item]
mylist = []
uncinata1 = " < "
uncinata2 = " >"
punto = "."
virgola = ","
puntoevirgola = ";"
dash = "-"
puntoesclamativo = "!"
duepunti = ":"
apostrofo = "’"
puntointerrogativo = "?"
angolate = "<>"
for value in data.values():
myxml = ' '.join(value)
# print(myxml)
tree = ET.fromstring(myxml)
lista = ([text.text for text in tree.findall('text')])
testo = (' '.join(lista))
testo = testo.replace(uncinata1, "")
testo = testo.replace(uncinata2, "")
testo = testo.replace(punto, "")
testo = testo.replace(virgola, "")
testo = testo.replace(puntoevirgola, "")
testo = testo.replace(dash, "")
testo = testo.replace(puntoesclamativo, "")
testo = testo.replace(duepunti, "")
testo = testo.replace(apostrofo, "")
testo = testo.replace(puntointerrogativo, "")
testo = testo.replace(angolate, "")
print(testo)
find_prima = re.compile(r"\]\s*prima(?!\S)")
find_fase_base = re.compile(r"\]\s*AN\s*([\w\s]+)\s*da\scui\sT") # ] AN parole da cui T
find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T
find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T
find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T)
find_fase_12_leo = re.compile(
r"(?!.*da cui)\]\s+AN\s1\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)\s+2\s+([a-zA-Z]+(?:\s+[a-zA-Z]+)*)") # ] AN 1 parole da cui 2 parole escludendo da cui dopo
find_fase_12T_leo = re.compile(
r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T") # ] AN 1 parole da cui 2 parole parola da cui T
matches_prima = re.findall(find_prima, testo)
lunghezza_prima = len(matches_prima)
mylist.append(lunghezza_prima)
count = 0
for elem in mylist:
count += elem
print(count)
if __name__ == "__main__":
main()
但是重要的是:
def filter_values_by_keyword(my_dict, filter_by):
return [key for key, value in my_dict.items() if filter_by in value]
,然后在main()函数中:
filtered_values = filter_values_by_keyword(data, '10.238')
for item in filtered_values:
del data[item]
它按原样返回文本,我不明白为什么。
编辑:
这是我的XML的示例,它实际上具有重复的pages
标签:
<pages>
<page id="1" bbox="0.000,0.000,462.047,680.315" rotate="0">
<textbox id="0" bbox="191.745,592.218,249.042,603.578">
<textline>
<new_line>
<text font="NUMPTY+ImprintMTnum" bbox="297.284,540.828,300.188,553.310" colourspace="DeviceGray" ncolour="0" size="12.482">della quale non conosce che una parte;] </text>
<text font="PYNIYO+ImprintMTnum-Italic" bbox="322.455,540.839,328.251,553.566" colourspace="DeviceGray" ncolour="0" size="12.727">prima</text>
<text font="NUMPTY+ImprintMTnum" bbox="331.206,545.345,334.683,552.834" colourspace="DeviceGray" ncolour="0" size="7.489">1</text>
<text font="NUMPTY+ImprintMTnum" bbox="177.602,528.028,180.850,540.510" colourspace="DeviceGray" ncolour="0" size="12.482">che nonconosce ancora appieno;</text>
<text font="NUMPTY+ImprintMTnum" bbox="189.430,532.545,192.908,540.034" colourspace="DeviceGray" ncolour="0" size="7.489">2</text>
<text font="NUMPTY+ImprintMTnum" bbox="203.879,528.028,208.975,540.510" colourspace="DeviceGray" ncolour="0" size="12.482">che</text>
</new_line>
</textline>
</textbox>
</page>
</pages>
答案 0 :(得分:1)
如果更换,该怎么办
return [key for key, value in my_dict.items() if filter_by in value]
作者:
return [key for key, value in my_dict.items() if filter_by == value]
或(更具可读性的努力):
更新
输入键是str
类型的,根据您的评论,该值也是字符串类型。只需创建一个字符串a="this is my 10.238 number"
,操作"10.238" in a
就可以得到一个不错的True
。
在处理具有“ 10.238”的字符串时,我会仔细检查两个运算符的类型。
my_keys=[]
for key, value in my_dict.items()
if isinstance(value, list):
if filter_by in value:
my_keys.append(key)
elif isinstance(value, str):
print("compare {} to {}".format(type(filter_by), type(value)))
if filter_by in value or value.index(filter_by) > -1:
my_keys.append(key)
else:
print("ops! {}".format(type(value)))
return my_keys