我试图在Python中使用difflib.SequenceMatcher
来返回两个字符串的最长公共子串
string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module> main(FLAGS, sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main worker.run(sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run if not self.__test_phase_wrapper(test_method): File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off self._host_power_off_test_cycle(host_of_stargate_master) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert interval=interval, File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack: File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off self._host_power_off_test_cycle(host_of_stargate_master) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs) File 
"/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert interval=interval, File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) File "/main/.python/util/base/log.py", line 204, in CHECK FATAL(log_msg, **kwargs) File "/main/.python/util/base/log.py", line 185, in FATAL sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module> main(FLAGS, sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main worker.run(sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run if not self.__test_phase_wrapper(test_method): File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off self._host_power_off_test_cycle(leader_host) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha self.verify_vms_not_on_host(host_vms, host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host self.wait_for_vms_power_on(vm_names, per_vm_timeout) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on interval=15) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack: File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off self._host_power_off_test_cycle(leader_host) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File 
"/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha self.verify_vms_not_on_host(host_vms, host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host self.wait_for_vms_power_on(vm_names, per_vm_timeout) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on interval=15) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) File "/main/.python/util/base/log.py", line 204, in CHECK FATAL(log_msg, **kwargs) File "/main/.python/util/base/log.py", line 185, in FATAL sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print match
print(string1[match.a: match.a + match.size])
string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44,"""
string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44"""
match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print(string1[match.a: match.a + match.size])
所以基本上在比较string1
和string2
[前两行]时会返回CRITICAL ha_test_util.py:44
,
当我从string1
和string2
[第6行和第7行]剪切一些行时,它返回ERROR agave_util.py:64 Timed out waiting for
基本上我的问题是为什么序列匹配器在我的第一个案例中没有返回正确的匹配?
答案 0 :(得分:3)
您遇到SequenceMatcher
自动垃圾启发式的影响(在您的情况下为负面)。来自docs:
自动垃圾启发式:
SequenceMatcher
支持一种启发式方法,可自动将某些序列项视为垃圾。该启发式会统计每个单独项目在序列中出现的次数。如果某个项目的重复项(在第一次出现之后)占序列的1%以上,且序列长度至少为200项,则此项目将被标记为“热门”,并且在序列匹配时被视为垃圾。在创建SequenceMatcher
时,可以通过将autojunk
参数设置为False
来关闭此启发式。
在SequenceMatcher
构造函数中,autojunk
默认为True
。如果您尝试使用autojunk=False
,您将获得预期的最长匹配:
from difflib import SequenceMatcher
string1 = """ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module> main(FLAGS, sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main worker.run(sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run if not self.__test_phase_wrapper(test_method): File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off self._host_power_off_test_cycle(host_of_stargate_master) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert interval=interval, File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack: File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off self._host_power_off_test_cycle(host_of_stargate_master) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs) File 
"/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert interval=interval, File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) File "/main/.python/util/base/log.py", line 204, in CHECK FATAL(log_msg, **kwargs) File "/main/.python/util/base/log.py", line 185, in FATAL sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
string2 = """ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module> main(FLAGS, sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main worker.run(sync_state) File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run if not self.__test_phase_wrapper(test_method): File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off self._host_power_off_test_cycle(leader_host) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha self.verify_vms_not_on_host(host_vms, host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host self.wait_for_vms_power_on(vm_names, per_vm_timeout) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on interval=15) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack: File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper func() File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off self._host_power_off_test_cycle(leader_host) File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle self.ha_util.power_off_and_check_ha(host) File 
"/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha self.verify_vms_not_on_host(host_vms, host) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host self.wait_for_vms_power_on(vm_names, per_vm_timeout) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on interval=15) File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true CHECK(result, message) File "/main/.python/util/base/log.py", line 204, in CHECK FATAL(log_msg, **kwargs) File "/main/.python/util/base/log.py", line 185, in FATAL sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
match = SequenceMatcher(None, string1, string2, autojunk=False).find_longest_match(0, len(string1), 0, len(string2))
print(match)
输出:
Match(a=110, b=156, size=534)
为了确认这一点,我们可以检查所有匹配的块并找出其中最长的一个:
>>> max(SequenceMatcher(None, string1, string2, autojunk=False).get_matching_blocks(),
... key=lambda m: m.size)
Match(a=110, b=156, size=534)
为了说明autojunk
对一个更简单的例子的影响,让我们来看看这里发生了什么:
>>> a = "aa:bb:cc" + ":"*200
>>> b = "aa:bb" + ":"*200
>>> SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
Match(a=0, b=0, size=6) # : is classified as junk
>>> SequenceMatcher(None, a, b, autojunk=False).find_longest_match(0, len(a), 0, len(b))
Match(a=8, b=5, size=200) # : is NOT classified as junk
在第一种情况下(默认为autojunk=True
),:
被视为垃圾字符(序列长度至少为200项,且它的出现次数占序列的1%以上),因此结果——按文档的说法,这种匹配旨在"looks right to people"——只是一个仅有6个字符(即开头的6个字符)的最长匹配。
在第二种情况下(使用显式autojunk=False
),垃圾启发式关闭,因此最长匹配是最后200个字符。
如果对较短的序列(短于200个字符)重复相同的测试,您可以看到autojunk
的取值不会带来任何区别,因为对于长度不足200的序列,垃圾启发式根本不会启用(参见source)。