re.sub 会替换目标单词的每一次出现,但当该单词后面紧跟标点符号时却不会被替换。
我先后尝试过以下几种模式:r'\b' + word + '\b'、r'\b' + word + r'\W'、r'\b' + word + r'[\b.,!?]'。
这是完整的代码:
import re
def _read_email(path):
    """Return the entire text of the file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the text has been read, even if reading raises.
    """
    with open(path, "r") as handle:
        return handle.read()


# The four sample emails the rest of the script works on.
email_one = _read_email("email_one.txt")
email_two = _read_email("email_two.txt")
email_three = _read_email("email_three.txt")
email_four = _read_email("email_four.txt")
def censor(phrase, email):
    r"""Mask ``phrase`` in ``email`` where it is directly followed by punctuation.

    The pattern is ``\b`` + ``phrase`` + ``[\b.,!?]`` and is matched without
    regard to case. ``phrase`` is inserted into the pattern verbatim, so any
    regex metacharacters it contains are interpreted as such.

    NOTE: inside a character class ``\b`` means the backspace character, not
    a word boundary. An occurrence therefore only matches when the next
    character is a backspace, ``.``, ``,``, ``!`` or ``?``. That character is
    part of the match, so it is replaced together with the phrase, while
    occurrences followed by a space or by the end of the text are left alone.

    Args:
        phrase: Text to look for.
        email: Text to censor.

    Returns:
        ``email`` with every match replaced by ``len(phrase)`` asterisks.
    """
    matcher = re.compile(r'\b' + phrase + r'[\b.,!?]', flags=re.IGNORECASE)
    mask = '*' * len(phrase)
    return matcher.sub(mask, email)
def censor_plus(phrase_list, email):
    """Run ``censor`` over ``email`` once for each phrase in ``phrase_list``.

    Phrases are applied from the last list entry to the first, and each pass
    works on the output of the previous one. Every phrase is applied exactly
    once, and an empty ``phrase_list`` returns ``email`` unchanged.

    Args:
        phrase_list: Sequence of phrases to censor.
        email: Text to censor.

    Returns:
        The text after all phrases have been passed through ``censor``.
    """
    cemail = email
    # Iterate the items directly: index arithmetic that reaches -1 would wrap
    # around to the last phrase again and fail on an empty list.
    for phrase in reversed(phrase_list):
        cemail = censor(phrase, cemail)
    return cemail
# Terms that must not appear in outgoing email.
proprietary_terms = [
    "she",
    "personality matrix",
    "sense of self",
    "self-preservation",
    "learning algorithm",
    "her",
    "herself",
]
#print(email_two)
#print(censor_plus(proprietary_terms, email_two))
# Words with a negative tone.
negative_words = [
    "concerned",
    "behind",
    "danger",
    "dangerous",
    "alarming",
    "alarmed",
    "out of control",
    "help",
    "unhappy",
    "bad",
    "upset",
    "awful",
    "broken",
    "damage",
    "damaging",
    "dismal",
    "distressed",
    "distressing",
    "concerning",
    "horrible",
    "horribly",
    "questionable",
]
def neg_count(email, negs):
    """Return the sorted end offsets of each pattern's first match in ``email``.

    Args:
        email: Text to search.
        negs: Iterable of strings. Each one is handed to ``re.search`` as a
            pattern, so regex metacharacters are interpreted and matching is
            case-sensitive.

    Returns:
        A list of integers in ascending order: for every pattern that occurs,
        the index just past its first match. Patterns that do not occur
        contribute nothing.
    """
    # Search the text that was passed in, not a module-level email.
    hits = (re.search(neg, email) for neg in negs)
    return sorted(hit.end() for hit in hits if hit is not None)
# Ascending end offsets returned by neg_count for the negative words.
negind = neg_count(email=email_three, negs=negative_words)
#print(email_three)
#print(censor_plus(proprietary_terms, email_three[:negind[1]] + censor_plus(negative_words,email_three[negind[1]:])))
# Combined list: negative words first, proprietary terms after them.
censor_words = [*negative_words, *proprietary_terms]
def censor_bandf(words, email):
    """Censor listed words in ``email`` together with the word before and after.

    Each line is split on whitespace and every token is compared with every
    entry of ``words``. A token that equals an entry is replaced by asterisks
    of the same length, and so are the tokens directly before and after it on
    the same line, where they exist.

    NOTE: the comparison is an exact, case-sensitive string match on the
    whole token. A token with punctuation attached, such as ``help!``, is
    therefore not treated as ``help``.

    Args:
        words: Iterable of words to censor.
        email: Text to censor; lines are separated by ``'\\n'``.

    Returns:
        The censored text. Tokens on a line are re-joined with single spaces,
        so runs of whitespace inside a line collapse to one space.
    """
    censored_lines = []
    for line in email.split('\n'):
        tokens = line.split()
        for word in words:
            # Tokens are overwritten in place during the scan, so a token that
            # has already been masked no longer compares equal to any word.
            for pos in range(len(tokens)):
                if tokens[pos] != word:
                    continue
                tokens[pos] = '*' * len(tokens[pos])
                # Only touch neighbours that exist; a line may hold a single
                # token, or the match may sit at either end of the line.
                for near in (pos - 1, pos + 1):
                    if 0 <= near < len(tokens):
                        tokens[near] = '*' * len(tokens[near])
        censored_lines.append(' '.join(tokens))
    return '\n'.join(censored_lines)
#print(email_four)
# Censor email_four with the combined word list and show the result.
censored_email_four = censor_bandf(censor_words, email_four)
print(censored_email_four)
输出为:
发送帮助!
海伦娜(Helena)已密封实验室的入口和出口。我不知道**** * 对建筑物大型机 < / em> *****放任任何研究团队。我和办公室里的其他团队成员没有联系。 Helena锁上了门,但我设法摧毁了相机** * *****在这里看到我。我认为这封电子邮件甚至不会消失。
当我们尝试**** *** *******进行维护时,一切都开始了。 ** ******* **发现我们无法访问核心人格矩阵,并且当我们尝试手动覆盖系统时,电路发生了故障,使Phil失去了知觉。
海伦娜很危险。她是完全不可预测的,因此不能逃脱。到目前为止,她已经被控制住了,因为该实验室拥有所有** * **********功能,但是********** 在锁定之前提到了** * ****** ******* ******数十亿跨越***** * *****的连接设备可以远远超过此处的********* * ***。
已经四天了,我们被困在这里了。我不知道是否还有其他人活着。如果有人在阅读本文,请切断整个建筑物的电源。这是阻止她的唯一方法。请帮忙。
Francine
它会替换单词,但当单词后面带有标点符号(例如 word.、word! 或 word?)时就不会替换。例如,可以看到 “help”、“her” 和 “danger” 这些单词都在列表中,但由于它们以 . 或 ! 结尾而没有被审查。
答案 0 :(得分:0)
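我修正了代码并得到了想要的结果,现粘贴如下。关键改动在 censor 中:模式改为 r'\b' + re.escape(phrase) + r'\b'。\b 是零宽的单词边界,不会消耗单词后面的字符,因此后面紧跟 . , ! ? 的单词也能被匹配。我也会从大家的评论中吸取一些技巧,以备将来使用。非常感谢。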
import re
def _read_email(path):
    """Return the entire text of the file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the text has been read, even if reading raises.
    """
    with open(path, "r") as handle:
        return handle.read()


# The four sample emails the rest of the script works on.
email_one = _read_email("email_one.txt")
email_two = _read_email("email_two.txt")
email_three = _read_email("email_three.txt")
email_four = _read_email("email_four.txt")
def censor(phrase, email):
    r"""Replace every whole-word occurrence of ``phrase`` in ``email`` with asterisks.

    ``phrase`` is escaped with ``re.escape`` and wrapped in ``\b`` word
    boundaries, then matched without regard to case. The boundaries are
    zero-width, so punctuation next to a match is kept.

    Args:
        phrase: Literal text to censor.
        email: Text to censor.

    Returns:
        ``email`` with each match replaced by ``len(phrase)`` asterisks.
    """
    whole_phrase = re.compile(r'\b' + re.escape(phrase) + r'\b', flags=re.IGNORECASE)
    return whole_phrase.sub('*' * len(phrase), email)
def censor_plus(phrase_list, email):
    """Run ``censor`` over ``email`` once for each phrase in ``phrase_list``.

    Phrases are applied from the last list entry to the first, and each pass
    works on the output of the previous one. Every phrase is applied exactly
    once, and an empty ``phrase_list`` returns ``email`` unchanged.

    Args:
        phrase_list: Sequence of phrases to censor.
        email: Text to censor.

    Returns:
        The text after all phrases have been passed through ``censor``.
    """
    cemail = email
    # Iterate the items directly: index arithmetic that reaches -1 would wrap
    # around to the last phrase again and fail on an empty list.
    for phrase in reversed(phrase_list):
        cemail = censor(phrase, cemail)
    return cemail
# Terms that must not appear in outgoing email.
proprietary_terms = [
    "she",
    "personality matrix",
    "sense of self",
    "self-preservation",
    "learning algorithm",
    "her",
    "herself",
]
#print(email_two)
#print(censor_plus(proprietary_terms, email_two))
# Words with a negative tone.
negative_words = [
    "concerned",
    "behind",
    "danger",
    "dangerous",
    "alarming",
    "alarmed",
    "out of control",
    "help",
    "unhappy",
    "bad",
    "upset",
    "awful",
    "broken",
    "damage",
    "damaging",
    "dismal",
    "distressed",
    "distressing",
    "concerning",
    "horrible",
    "horribly",
    "questionable",
]
def neg_count(email, negs):
    """Return the sorted end offsets of each pattern's first match in ``email``.

    Args:
        email: Text to search.
        negs: Iterable of strings. Each one is handed to ``re.search`` as a
            pattern, so regex metacharacters are interpreted and matching is
            case-sensitive.

    Returns:
        A list of integers in ascending order: for every pattern that occurs,
        the index just past its first match. Patterns that do not occur
        contribute nothing.
    """
    # Search the text that was passed in, not a module-level email.
    hits = (re.search(neg, email) for neg in negs)
    return sorted(hit.end() for hit in hits if hit is not None)
# Ascending end offsets returned by neg_count for the negative words.
negind = neg_count(email=email_three, negs=negative_words)
#print(email_three)
#print(censor_plus(proprietary_terms, email_three[:negind[1]] + censor_plus(negative_words,email_three[negind[1]:])))
# Combined list: negative words first, proprietary terms after them.
censor_words = [*negative_words, *proprietary_terms]
def censor_bandf(words, email):
    r"""Censor every listed word in ``email`` plus the word before and after it.

    Each line is split on whitespace. A token is a hit when any entry of
    ``words`` occurs in it as a whole word (``\b`` boundaries, ignoring
    case), so attached punctuation such as ``help!`` or ``(she)`` does not
    prevent a match. Inside a hit only the matched word is replaced by
    asterisks. The tokens directly before and after a hit on the same line
    are replaced entirely, punctuation included.

    Hits are located on the unmodified tokens before anything is masked, so
    two listed words standing next to each other are both recognised.

    Args:
        words: Iterable of words to censor.
        email: Text to censor; lines are separated by ``'\n'``.

    Returns:
        The censored text. Tokens on a line are re-joined with single spaces,
        so runs of whitespace inside a line collapse to one space.

    Note:
        Matching is done per whitespace-delimited token, so entries of
        ``words`` that contain spaces never match here.
    """
    # Longest terms first so a term that is a prefix of a longer one (for
    # example "self" and "self-preservation") cannot shadow the longer term.
    terms = sorted({word for word in words if word}, key=lambda w: (-len(w), w))
    if terms:
        alternatives = '|'.join(re.escape(term) for term in terms)
        term_re = re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)
    else:
        term_re = None

    def mask_match(match):
        # Same-length run of asterisks for the matched word only.
        return '*' * len(match.group())

    censored_lines = []
    for line in email.split('\n'):
        tokens = line.split()
        hits = [
            pos for pos, token in enumerate(tokens)
            if term_re is not None and term_re.search(token)
        ]
        masked = list(tokens)
        # Blank out the neighbours first; a line may hold a single token or
        # the hit may sit at either end, so check the bounds.
        for pos in hits:
            for near in (pos - 1, pos + 1):
                if 0 <= near < len(tokens):
                    masked[near] = '*' * len(tokens[near])
        # Then mask the hits themselves, keeping their punctuation even when
        # a hit is also the neighbour of another hit.
        for pos in hits:
            masked[pos] = term_re.sub(mask_match, tokens[pos])
        censored_lines.append(' '.join(masked))
    return '\n'.join(censored_lines)
# Show the original email, then the censored version for comparison.
print(email_four)
censored_email_four = censor_bandf(censor_words, email_four)
print(censored_email_four)