我想规范一个波斯语列表。但是我无法达到理想的结果。 该功能从文本和单词中删除不必要的字符。
normal = [""","»","http://","www.",
"cloob","instagram","https://","t.me","هه","اا","یی","خخ","وو",
"مم","ـــ","؟؟","!!",":)",":((",":))",
"-*","=))","‌","…","∞","غغ","جج",":-*","نن","‏",
"..."," ","دد",":-*","@};-","کک"]
def normalizer(mylst):
n = ""
retrnlst = []
for snt in mylst:
#print(snt)
for n in normal:
for n in snt:
if (n=="http://"):
snt = snt.replace(n, "http:// ")
retrnlst.append(snt)
elif (n=="www."):
snt = snt.replace(n, "www. ")
retrnlst.append(snt)
elif (n=="cloob"):
snt = snt.replace(n, "cloob ")
retrnlst.append(snt)
elif (n=="instagram"):
snt = snt.replace(n, "instagram ")
retrnlst.append(snt)
elif (n=="https://"):
snt = snt.replace(n, "https:// ")
retrnlst.append(snt)
elif (n=="t.me"):
snt = snt.replace(n, "t.me ")
retrnlst.append(snt)
elif (n=="هه"):
snt = snt.replace(n, "ه")
retrnlst.append(snt)
elif (n=="اا"):
snt = snt.replace(n, "ا")
retrnlst.append(snt)
elif (n=="يي"):
snt = snt.replace(n, "ي")
retrnlst.append(snt)
elif (n=="خخ"):
snt = snt.replace(n, "خ")
retrnlst.append(snt)
elif (n=="وو"):
snt = snt.replace(n, "و")
retrnlst.append(snt)
elif (n=="مم"):
snt = snt.replace(n, "م")
retrnlst.append(snt)
elif (n=="غغ"):
snt = snt.replace(n, "غ")
retrnlst.append(snt)
elif (n=="نن"):
snt = snt.replace(n, "ن")
retrnlst.append(snt)
elif (n=="دد"):
snt = snt.replace(n, "د")
retrnlst.append(snt)
elif (n=="کک"):
snt = snt.replace(n, "ک")
retrnlst.append(snt)
elif (n=="گگ"):
snt = snt.replace(n, "گ")
retrnlst.append(snt)
elif (n=="فف"):
snt = snt.replace(n, "ف")
retrnlst.append(snt)
elif (n=="جج"):
snt = snt.replace(n, "ج")
retrnlst.append(snt)
else:
snt = snt.replace(j, "")
retrnlst.append(snt)
return retrnlst
##my input string
var="منننننننن دوستتتتتتتتت دااااااااارممممممممممممم عشقم http://»"
var=normalizer(var)
print(var)
理想的结果是:“ http://مندوستدارمعشقم” 但这是代码的结果:
['م','ن','ن','ن','ن','ن','ن','ن','ن',',','د','و' ,'س','ت','ت','ت','ت','ت','ت','ت','ت','ت','','د','ا ','ا','ا','ا','ا','ا','ا','ا','ا','ر','م','م','م', 'م','م','م','م','م','م','م','م','م','م',','ع','ش' ,'ق','م',',',','h','t','t','p',':','/','/','w','w' ,'w','。','&','r','a','q','u','o',';']