我的代码遇到了一个大问题。
TL; DR:经过一番评论,我决定将整个代码发布在这里:
https://repl.it/repls/AustereShinyBetatest
这是我的代码:
def highlight_nonmodified(content: str) -> str:
regex = re.compile(r'(?s)(\{.*?[^\}]+\})', re.I | re.S)
replace = r'#\1'
content = regex.sub(replace, content)
return content
def get_line(string_t: str, original: str) -> int:
original = original.splitlines(True)
for (i, line) in enumerate(original, 1):
if string_t[1:] in line:
return i
return -1
def highligh_merge(original: str, modified: str) -> str:
for line in modified.splitlines(True):
if line.startswith('#'):
numer = get_line(line, original)
error = r"#Tag not supported at line{0}\n".format(numer)
error = error + line
modified = modified.replace(line, error)
我的问题是,会发生以下情况:
Textfile.txt(原始):
1. Here goes some text. {tag} A wonderful day. It's soon cristmas.
2. Happy 2019, soon. {Some useful tag!} Something else goes here.
3. Happy ending. Yeppe! See you.
4.
5 Happy KKK!
6. Happy B-Day!
7
8. Universe is cool!
9.
10. {Tagish}.
11.
12. {Slugish}. Here goes another line. {Slugish} since this is a new sentence.
13.
14. endline.
Modified.txt:
Here goes some text. A wonderful day. It's soon cristmas.
Happy 2019, soon. #{Some useful tag!} Something else goes here.
Happy ending. Yeppe! See you.
Happy KKK!
Happy B-Day!
Universe is cool!
.
#Error: Tag not supported at line-1\n#{Slugish}. Here goes another line. #{Slugish} since this is a new sentence.
endline.
我似乎无法获得精确的行号和行的比较,我在这里做错了,很明显,我存储了两个副本(原始副本和修改过的副本),然后我选择然后尝试选择通过逐行循环从原始文本中删除行号。但是仍然没有任何成功,甚至可能。在此先多谢!
答案 0 :(得分:4)
如果多行文本可能已被删除,我认为这是不可能的。但是,如果您控制标记过程,则可以在标记中包含原始行号:
curl -XGET http://<host>:9200/_cat/indices?v
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
yellow open .kibana Z04tBGcDQ7OLRqjciAqiDA 1 1
curl -XGET http://<host>:9200/_cat/shards?v
index shard prirep state docs store ip node
.kibana 0 p UNASSIGNED
.kibana 0 r UNASSIGNED
curl -XGET -H 'Content-Type: application/json' http://<host>:9200/_cluster/allocation/explain -d '{"index": ".kibana","shard": 0,"primary": true}'
{"index":".kibana","shard":0,"primary":true,"current_state":"unassigned","unassigned_info":{"reason":"INDEX_CREATED","at":"2019-01-03T09:26:19.270Z","last_allocation_status":"no_attempt"},"can_allocate":"no","allocate_explanation":"cannot allocate because allocation is not permitted to any of the nodes"}
然后恢复它是微不足道的
{ foo:12 }
此代码的修改版本:
original = int(re.search(r'\d+', tag).group(0))
生成此输出:
import re
def annotate_tags(content: str) -> str:
"""Annotate tags with line numbers."""
tag_pattern = re.compile(r'(\{(?P<tag_value>[^}]+)\})')
lines = content.splitlines(True)
annotated_lines = []
for idx, line in enumerate(lines, 1):
annotated_lines.append(tag_pattern.sub(r'{\g<tag_value>:%s}' % idx, line))
annotated = ''.join(annotated_lines)
return annotated
def modify(content: str) -> str:
supported_tags = {
re.compile(r'(\{tag:\d+\})'): r'',
re.compile(r'(\{Tagish:\d+\})'): r''
}
for pattern, replace in supported_tags.items():
matches = pattern.findall(content)
if matches:
content = pattern.sub(replace, content)
return content
def highlight_nonmodified(content: str) -> str:
regex = re.compile(r'(?s)(\{.*?[^\}]+\})', re.I | re.S)
replace = r'#\1'
content = regex.sub(replace, content)
return content
def get_line(string_t: str, original: str) -> int:
tag_pattern = re.compile(r'(\{[^}]+:(?P<line_no>\d+)\})')
match = tag_pattern.search(string_t)
if match:
return match.group('line_no')
return -1
def highlight_merge(original: str, modified: str) -> str:
tag_regex = re.compile(r'#(?s)(\{.*?[^\}]+\})', re.I | re.S)
for line in modified.splitlines(True):
if tag_regex.search(line):
numer = get_line(line, original)
error = "#Tag not supported at line{0}\n".format(numer)
error = error + line
modified = modified.replace(line, error)
return modified
if __name__ == '__main__':
file = 'textfile.txt'
raw = ""
with open(file, 'rt', encoding='utf-8') as f:
for i, s in enumerate(f, 1):
raw += "{}. {}".format(i, s)
original = modified = raw
modified = annotate_tags(modified)
modified = modify(modified)
modified = highlight_nonmodified(modified)
modified = highlight_merge(original, modified)
with open("modified.txt", 'w', encoding='utf-8') as f:
f.write(modified)
答案 1 :(得分:2)
下面是一个简短的脚本,用于导入文件,清理数据,创建枚举的字典并输出结果(基于print_results变量为可选)。
(如果我不能正确解释您的问题,请告诉我!)
import re
from os import path
"""
Create an error class for trying to close a file that isn't open.
"""
class FileException(Exception):
pass
class FileNotOpenError(FileException):
pass
"""
Input variables. base_path is just the directory where your files are located.
If they are in different directories, then use a second variable.
"""
base_path = r'C:\..\[folder containing text files]'
original_filename = 'test_text.txt'
modified_filename = 'modified_text.txt'
def import_data(file_name, root=base_path):
"""
Read each text file into a list of lines.
"""
full_path = path.join(root, file_name)
with open(full_path, 'r') as f:
data = f.readlines()
try:
f.close()
except FileNotOpenError:
pass
if len(data) > 0:
return data
def remove_numbering(input):
"""
RegEx to clean data; This will remove only the line numbers and not
any subsequent number-period combinations in the line.
"""
p = re.compile(r'^([0-9]+[.]?\s)')
return p.sub('', input)
def text_dict(text_list):
"""
Remove numbering from either file; Considers period punctuation following number.
"""
new_text = [remove_numbering(i).lstrip() for i in text_list]
return {idx+1:val for idx, val in enumerate(new_text)}
def compare_files(original, modified, missing_list=None):
# Create a fresh list (probably not necessary)
if missing_list is None:
missing_list = list()
# Ensure that data types are dictionaries.
if isinstance(original, dict) and isinstance(_modified, dict):
# Use list comprehension to compare lines in each file.
# Modified line numbers will end up in a list, which we will return.
modified_index_list = [idx for idx in original.keys() if original[idx] != modified[idx]]
# Check to see if list exists; Return it if it does.
# if len(modified_index_list) > 0:
if not modified_index_list is None:
return modified_index_list
def comparison_findings(missing_list, original_dict, modified_dict):
print('Modifications found on lines:\n- ' + '\n- '.join([str(i) for i in missing_list]))
print('\n\n\tOriginal:\n')
max_len = max([len(original_dict[i].replace('\n','').rstrip()) for i in original_dict.keys() if i in missing_list])
print('\t\t{0:^7}{1:^{x}}'.format('Line','Value',x=max_len))
for i in missing_list:
temp_val = original_dict[i].replace('\n','').rstrip()
print('\t\t{0:>5}{1:2}{2:<{x}}'.format(str(i), '', temp_val, x=max_len))
print('\n\n\tModified:\n')
max_len = max([len(modified_dict[i].replace('\n','').rstrip()) for i in modified_dict.keys() if i in missing_list])
print('\t\t{0:^7}{1:^{x}}'.format('Line','Value',x=max_len))
for i in xyz:
temp_val = modified_dict[i].replace('\n','').rstrip()
print('\t\t{0:>5}{1:2}{2:<{x}}'.format(str(i), '', temp_val, x=max_len))
if __name__ == '__main__':
print_results = True
# Import text files.
orig_data = import_data(original_filename)
mod_data = import_data(modified_filename)
# Create enumerated dictionaries from text files.
_original = text_dict(orig_data)
_modified = text_dict(mod_data)
# Get a list of modified lines.
mod_list = compare_files(_original, _modified)
# Output results of file comparison.
if print_results:
comparison_findings(mod_list, _original, _modified)
答案 2 :(得分:1)
当您在get_line
内调用函数highligh_merge
时,您正在使用修改后的line
变量执行该函数,因此line
实际上将不会位于原始文本文件中。如果您查看line
的值:
#{Slugish}. Here goes another line. #{Slugish} since this is a new sentence.
您可以看到,这显然不在原始textfile.txt中。因此,这将返回行号-1。
一种解决方案是将for
函数内的highligh_merge
循环从以下位置更改:
for line in modified.splitlines(True):
收件人:
for numer, line in enumerate(modified.splitlines(True)):
现在,每次迭代中的numer
等于行数-1。只需使用numer + 1
即可获取正在处理的行的确切行数。
我希望这会有所帮助。 :)