在Python中使用序列匹配器查找最长的公共字符串

时间:2017-07-13 11:50:47

标签: python sequence

我试图在Python中使用difflib.SequenceMatcher来返回两个字符串之间最长的公共子串:

string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    
self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""

string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    
self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""

match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print match
print(string1[match.a: match.a + match.size])

string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44,"""
string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44"""
match = SequenceMatcher(None, string1, string2).find_longest_match(0,    len(string1), 0, len(string2))
print(string1[match.a: match.a + match.size])

所以基本上,在比较完整的string1和string2[前两行]时,它返回的是“CRITICAL ha_test_util.py:44”;而当我从string1和string2中剪掉一些行[第6行和第7行]之后,它返回的是“ERROR agave_util.py:64 Timed out waiting for”。

基本上我的问题是为什么序列匹配器在我的第一个案例中没有返回正确的匹配?

1 个答案:

答案 0 :(得分:3)

您遇到的是SequenceMatcher自动垃圾(autojunk)启发式带来的影响(在您的情况下是负面影响)。以下内容来自官方文档(docs):

  

自动垃圾启发式:SequenceMatcher支持一种启发式操作,可自动将某些序列项视为垃圾。该启发式会统计每个单独的项在序列中出现的次数。如果某个项的重复项(第一次出现之后的那些)占序列的1%以上,且序列长度至少为200项,则该项会被标记为“热门”(popular),并在序列匹配时被视为垃圾。在创建SequenceMatcher时,可以通过将autojunk参数设置为False来关闭此启发式。

在SequenceMatcher的构造函数中,autojunk默认为True。如果您改用autojunk=False,就会得到预期的最长匹配:

from difflib import SequenceMatcher

string1 = """ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    
self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
string2 = """ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    
self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""

match = SequenceMatcher(None, string1, string2, autojunk=False).find_longest_match(0, len(string1), 0, len(string2))
print(match)

输出:

Match(a=110, b=156, size=534)

为了确认这一点,我们可以检查所有匹配的块并找出其中最长的一个:

>>> max(SequenceMatcher(None, string1, string2, autojunk=False).get_matching_blocks(),
...     key=lambda m: m.size)
Match(a=110, b=156, size=534)

为了说明autojunk对一个更简单的例子的影响,让我们来看看这里发生了什么:

>>> a = "aa:bb:cc" + ":"*200
>>> b = "aa:bb" + ":"*200
>>> SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
Match(a=0, b=0, size=6)     # : is classified as junk
>>> SequenceMatcher(None, a, b, autojunk=False).find_longest_match(0, len(a), 0, len(b))
Match(a=8, b=5, size=200)   # : is NOT classified as junk

在第一种情况下(默认autojunk=True),“:”被视为垃圾字符(序列长度至少为200项,而它的重复项占序列的1%以上),因此得到的“looks right to people”(在人看来合理)的最长匹配只有6个字符(即开头的6个字符)。

在第二种情况下(使用显式autojunk=False),垃圾启发式关闭,因此最长匹配是最后200个字符。

如果对较短的序列(短于200个字符)重复相同的测试,您会看到autojunk的取值不会造成任何区别,因为对于这样的短序列,垃圾启发式不会生效(参见source,即difflib的源码)。