这是CFS_Config.txt
中的数据。文件夹路径将存储在root_dir
中。 source_documents
文件夹包含2个不同的文件。
Folder Path = C:\Users\user\Documents\Lynn\FYPJ P3\FYP updated 9.10.18 (Tues) trying\FYP\dataprep\source_documents
ED Notes name = Notes
Admission name = Adm
Discharge name = Dis
Output = ../dataprep/docs2txt_output
这是所有文件将在for循环中循环然后在文本文件(在docx2txt.py
中打印)中的代码
def read_config():
# open existing file to read configuration
cfs_config_txt = open("../CFS_Config.txt", "r")
file_list = []
root_dir = ""
ednotes_name = ""
admission_name = ""
discharge_name = ""
output = ""
for line in cfs_config_txt:
file_list.append(line)
if "Folder Path = " in file_list[0]:
root_dir = str(file_list[0])
root_dir = root_dir.replace("Folder Path = ", "")
root_dir = root_dir.replace("\n", "")
if "ED Notes name = " in file_list[1]:
ednotes_name = str(file_list[1])
ednotes_name = ednotes_name.replace("ED Notes name = ", "")
ednotes_name = ednotes_name.replace("\n", "")
if "Admission name = " in file_list[2]:
admission_name = str(file_list[2])
admission_name = admission_name.replace("Admission name = ", "")
admission_name = admission_name + ".txt"
admission_name = admission_name.replace("\n", "")
if "Discharge name = " in file_list[3]:
discharge_name = str(file_list[3])
discharge_name = discharge_name.replace("Admission name = ", "")
discharge_name = discharge_name + ".txt"
discharge_name = discharge_name.replace("\n", "")
if "Output = " in file_list[4]:
output = str(file_list[4])
output = output.replace("Output = ", "")
output = output + ".txt"
output = output.replace("\n", "")
return root_dir, ednotes_name, admission_name, discharge_name, output
#Below is the codes to loop every file in the root_dir. The root_dir will
contain the folder path that read from the CFS_Config.txt file.
def convert_txt(choices):
root_dir, ednotes_name, admission_name, discharge_name, output =
read_config()
if(choices == 1):
# open new file to write string data textfile
text_file = open(output, 'w', encoding='utf-8')
text_file.write("cat_id|content\n")
for filename in os.listdir(root_dir):
source_directory = root_dir + '/' + filename
getFilenameOnly = os.path.basename(source_directory)
#print(getFilenameOnly)
whole_string = ""
document = ""
document += docx2txt.process(source_directory)
print(document)
if ednotes_name in getFilenameOnly:
arr = ednotes_extractor.get_ednotes(source_directory)
list2str = str(arr)
c = cleanString(newstring=list2str)
new_arr = []
new_arr += [c]
# open existing file to append the items in the array to the previously written textfile
text_file = open(output, 'a', encoding='utf-8')
for item in new_arr:
text_file.write("%s\n" % item)
elif admission_name in getFilenameOnly:
categoryType = ('_'.join(getFilenameOnly.split('_')[1:3]))
categoryType = categoryType.replace("_", "")
categoryType = categoryType.replace("Cat", "")
categoryType = categoryType.replace(" ", "")
for word in document.split():
whole_string += word + " "
whole_string = delete_phrase(whole_string)
whole_string = delete_header(whole_string)
text_file = open(output, "a", encoding='utf-8')
text_file.write("\n")
text_file.write(categoryType + '|' + whole_string)
当我打印root_dir
时,其中有两个不同的文件。
The output of print(root_dir):
883056_Cat_7_Notes.docx
883434_Cat_7_Patient_Adm.docx
883056_Cat_7_Patient_Dis.docx
683700_Cat_6_Notes.docx
588300_Cat_6_Patient_Dis.docx
588817_Cat_4_Notes.docx
问题是他们只打印所有数据
.......Notes.docx
个文件。
请帮我看看代码,谢谢! :((
答案 0 :(得分:1)
您的问题是,您在配置中将变量ednotes_name
定义为Notes
,因此脚本只读取三个Notes
之后具有_
的文件。 / p>