尝试对某些文件批量运行正则表达式。
正则表达式代码:
import re
def DOCtoSTRING(path):
return text
def valuesHextractor(text):
STRING =""
#let's find pacient personal info
regex = re.compile('CONSULTAŢIE(?s).*EXAMENUL')
pacientdata=regex.search(text)
pacientinfo = re.sub('[A-Z]+:',"",pacientdata[0])
STRING=STRING+pacientinfo.strip("CONSULTAŢIE").strip("EXAMENUL")+" "
#values
regex = re.compile('EXAMENUL OFTALMOLOGIC:(?s).*TRATAMENT')
pacientvalues=regex.search(text)
#AV OD
#fc
regex=re.compile("1.AV.*OD.*?fc[;\.\+\- 0-9]*")
AVfc=regex.search(pacientvalues[0])
AVODfc=re.sub("1.AV.*OD.*?fc[;\. 0-9]*?","",AVfc[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVODfc)==None:
AVODfc="None"
STRING=STRING+AVODfc+" "
#cc
regex=re.compile("1.AV.*OD.*?cc[;\.\+\- 0-9]*")
AVcc=regex.search(pacientvalues[0])
AVODcc=re.sub("1.AV.*OD.*?cc[;\. 0-9]*?","",AVcc[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVODcc)==None:
AVODcc="None"
STRING=STRING+AVODcc+" "
#cyl
regex=re.compile("1.AV.*OD.*?cyl[;\.\+\- 0-9]*")
AVcyl=regex.search(pacientvalues[0])
AVODcyl=re.sub("1.AV.*OD.*?cyl[;\. 0-9]*?","",AVcyl[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVODcyl)==None:
AVODcyl="None"
STRING=STRING+AVODcyl+" "
#ax
regex=re.compile("1.AV.*OD.*?ax[;\.\+\- 0-9]*")
AVax=regex.search(pacientvalues[0])
AVODax=re.sub("1.AV.*OD.*?ax[;\. 0-9]*?","",AVax[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVODax)==None:
AVODax="None"
STRING=STRING+AVODax+" "
#AV OS
#fc
regex=re.compile("1.AV.*OS.*?fc[;\. 0-9]*")
AVfc=regex.search(pacientvalues[0])
AVOSfc=re.sub("1.AV.*OS.*?fc[;\. 0-9]*?","",AVfc[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOSfc)==None:
AVOSfc="None"
STRING=STRING+AVOSfc+" "
#cc
regex=re.compile("1.AV.*OS.*?cc[;\.\+\- 0-9]*")
AVcc=regex.search(pacientvalues[0])
AVOScc=re.sub("1.AV.*OS.*?cc[;\. 0-9]*?","",AVcc[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOScc)==None:
AVOScc="None"
STRING=STRING+AVOScc+" "
#cyl
regex=re.compile("1.AV.*OS.*?cyl[;\.\+\- 0-9]*")
AVcyl=regex.search(pacientvalues[0])
AVOScyl=re.sub("1.AV.*OS.*?cyl[;\. 0-9]*?","",AVcyl[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOScyl)==None:
AVOScyl="None"
STRING=STRING+AVOScyl+" "
#ax
regex=re.compile("1.AV.*OS.*?ax[;\.\+\- 0-9]*")
AVax=regex.search(pacientvalues[0])
AVOSax=re.sub("1.AV.*OS.*?ax[;\. 0-9]*?","",AVax[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOSax)==None:
AVOSax="None"
STRING=STRING+AVOSax+" "
############From here ignore names check only the #name
####### DP
regex=re.compile("1.AV.*OS.*?DP=[;\.\+\- 0-9]*")
AVax=regex.search(pacientvalues[0])
AVOSax=re.sub("1.AV.*OS.*?DP=[;\. 0-9]*?","",AVax[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOSax)==None:
AVOSax="None"
DP=AVOSax
##########
#RO OD
#cyl
regex=re.compile("2.RO.*OD.*?cyl[;\.\+\- 0-9]*")
AVcyl=regex.search(pacientvalues[0])
AVODcyl=re.sub("2.RO.*OD.*?cyl[;\. 0-9]*?","",AVcyl[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVODcyl)==None:
AVODcyl="None"
STRING=STRING+AVODcyl+" "
#ax
regex=re.compile("2.RO.*OD.*?ax[;\.\+\- 0-9]*")
AVax=regex.search(pacientvalues[0])
AVODax=re.sub("2.RO.*OD.*?ax[;\. 0-9]*?","",AVax[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVODax)==None:
AVODax="None"
STRING=STRING+AVODax+" "
#RO OS
#cyl
regex=re.compile("2.RO.*OS.*?cyl[;\.\+\- 0-9]*")
AVcyl=regex.search(pacientvalues[0])
AVOScyl=re.sub("2.RO.*OS.*?cyl[;\. 0-9]*?","",AVcyl[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOScyl)==None:
AVOScyl="None"
STRING=STRING+AVOScyl+" "
#ax
regex=re.compile("2.RO.*OS.*?ax[;\.\+\- 0-9]*")
AVax=regex.search(pacientvalues[0])
AVOSax=re.sub("2.RO.*OS.*?ax[;\. 0-9]*?","",AVax[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOSax)==None:
AVOSax="None"
STRING=STRING+AVOSax+" "
#3.PIO OD
regex=re.compile("3.PIO.*OD=[;\.\+\- 0-9]*")
AVfc=regex.search(pacientvalues[0])
AVODfc=re.sub("3.PIO.*?OD=[;\. 0-9]*?","",AVfc[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVODfc)==None:
AVODfc="None"
STRING=STRING+AVODfc+" "
#3.PIO OS
regex=re.compile("3.PIO.*?OS=[;\. 0-9]*")
AVfc=regex.search(pacientvalues[0])
AVOSfc=re.sub("3.PIO.*?OS=[;\. 0-9]*?","",AVfc[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOSfc)==None:
AVOSfc="None"
STRING=STRING+AVOSfc+" "
#4.FO OD
regex=re.compile("4.FO(?s).*OD:[;\.\+\- 0-9]*")
AVfc=regex.search(pacientvalues[0])
AVODfc=re.sub("4.FO(?s).*?OD:[;\. 0-9]*?","",AVfc[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVODfc)==None:
AVODfc="None"
STRING=STRING+AVODfc+" "
#4.FO OS
regex=re.compile("4.FO(?s).*?OS:[;\. 0-9]*")
AVfc=regex.search(pacientvalues[0])
AVOSfc=re.sub("4.FO(?s).*?OS:[;\. 0-9]*?","",AVfc[0])
regex=re.compile("[\+\-0-9]+")
if regex.search(AVOSfc)==None:
AVOSfc="None"
STRING=STRING+AVOSfc+" "
################################################## NORMAL FROM HERE
#remove space between a + - and a number
STRING=re.sub("\-( )+","-",STRING)
STRING=re.sub("\+( )+ ","+",STRING)
STRING=re.sub("(\-\+)","-+ ",STRING)
STRING=re.sub("(\+\-)","+- ",STRING)
#treatment
regex = re.compile('TRATAMENT:(?s).*')
treatment = regex.search(text)
treatment = treatment[0].replace('TRATAMENT:',"")
STRING=STRING+treatment
STRING=STRING+DP
regex=re.compile("( )+")
STRING=STRING.replace('\n',' ').replace('\t', ' ').replace(';','')
STRING=re.sub(regex," ",STRING)
print (STRING)
f=open(input("file PATH: ") ,'r')
text=f.read()
valuesHextractor(text)
f.close
我正在运行的其他代码
f=open("filenames.txt")
for filename in f:
filename = filename.strip("\n")
file=open("C:/Users/User/Desktop/toate/"+filename)
text=file.read()
valuesHextractor(text)
file.close()
f.close()
文件为 .doc - Microsoft Word 2003 。
使用记事本打开其中一个文件,然后尝试 Save as
,它显示的编码为 ANSI 。
出现以下错误:
== RESTART: C:/Users/User/AppData/Local/Programs/Python/Python37-32/go.py ==
Traceback (most recent call last):
File "C:/Users/User/AppData/Local/Programs/Python/Python37-32/go.py", line 4, in <module>
text=file.read()
File "C:\Users\User\AppData\Local\Programs\Python\Python37-32\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1070: character maps to <undefined>
>>>
我该如何解决?
答案 0 :(得分:2)
您只需在python脚本的顶部添加以下行即可。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
OR
将文件读取为二进制文件,它应该可以工作。 替换此行
f=open(input("file PATH: ") ,'r')
与
f=open(input("file PATH: ") ,'rb')
答案 1 :(得分:1)
使用“ utf8”编码打开文件。
f = open("filenames.txt", encoding="utf8")
答案 2 :(得分:1)
f=open("filenames.txt")
for filename in f:
filename = filename.strip("\n")
file=open("C:/Users/User/Desktop/toate/"+filename, encoding="mbcs") # <-----
text=file.read()
valuesHextractor(text)
file.close()
f.close()
根据此python doc,Ansi编码等于mbcs。然后使用encoding="mbcs"
打开文件应该可以解决此问题。
更新:错误“ UnicodeDecodeError”已经暗示python已经尝试使用'utf-8'对其进行解码,但失败了。因此,不能使用“ utf-8”。