这是我如何从MS Word文档中提取数据的完整代码。 我想做的是将包含来自MS Word的数据的one_document存储到数组中,但是在打印时遇到错误
This is the error
File "C:/Users/user/Documents/Proj P3/Proj updated 9.10.18 (Tue) trying/FYP/preprocessing/inpatientcare_extractor.py", line 149, in neurosensory_sector
"Date Initiated": getDataList1[0], "Problem": getDataList1[1],
IndexError: list index out of range
请帮我看看如何将one_document存储在数组中:((谢谢 以下代码是我如何从每个Word文档表单元格提取数据的完整代码。
from __future__ import (
absolute_import, division, print_function, unicode_literals
)
import docx
from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
import os
import os.path
from preprocessing import docs2txt_source
''' ######################################################################## '''
one_document = {}
def iter_block_items(parent):
if isinstance(parent, _Document):
parent_elm = parent.element.body
# print(parent_elm.xml)
elif isinstance(parent, _Cell):
parent_elm = parent._tc
else:
raise ValueError("something's not right")
for child in parent_elm.iterchildren():
if isinstance(child, CT_P):
yield Paragraph(child, parent)
elif isinstance(child, CT_Tbl):
yield Table(child, parent)
def noDupCell(row):
prior_tc = None
for cell in row.cells:
this_tc = cell._tc
if this_tc is prior_tc: # skip cells pointing to same `<w:tc>` element
continue
yield cell
prior_tc = this_tc
def progress_notes_sector(source_directory, one_document):
sector = ""
stopper_neuro = " Neurosensory "
'''For Progress Note'''
getDataList = []
document = Document(source_directory)
'''Progress Notes'''
for block in iter_block_items(document):
if isinstance(block, Paragraph):
# if the header is "Progress Notes" , the sector will replace to be "Progress Note"
if (block.text == " Progress Notes "):
sector = block.text
#if the block.text == " Progress Notes "
get2ndData = block.text
#loop and get whatever data that is before stopper_neuro
if (get2ndData == stopper_neuro):
break
#add into the getDataList
else:
getDataList.append(get2ndData)
getFilenameOnly = os.path.basename(source_directory)
categoryType = ('_'.join(getFilenameOnly.split('_')[1:3]))
categoryType = categoryType.replace("_", "")
docNumber = ('_'.join(getFilenameOnly.split('_')[:1]))
getString_pro = ""
for i in range(len(getDataList)):
getString = getDataList[i] + ". "
getString = getString.replace(" Progress Notes . ", "")
getString_pro += getString
'''Progress Note Sector'''
progressList = {"Id": docNumber, "Type": categoryType, "Progress Notes": {"Remarks": getString_pro}}
one_document.update(**progressList)
def neurosensory_sector(source_directory, one_document):
sector = ""
stopper_Cardio = " Cardiovascular "
stopper_neuro = " Neurosensory "
# Status for Neuro
getStatus = ""
'''For Neurosensory'''
getDataList1 = []
list_ass = []
new_r = ""
document = Document(source_directory)
'''Neurosensory Sector'''
for block in iter_block_items(document):
if isinstance(block, Paragraph):
# print(block.text)
#find the word that is beside the assessment
# for word in block.text.split():
# #add the block.text into a list
# list_ass.append(word)
if (block.text == stopper_neuro):
sector = block.text
get2ndData = block.text
# loop and get whatever data that is before stopper_neuro
if (get2ndData == stopper_Cardio):
break
elif (get2ndData == "No" or get2ndData == "Yes"):
getStatus += get2ndData
#it will check is "Assessment:" exist in the get2ndData, If it exist it will replace it blank and store whatever that is beside the Assessment in new_r
if (getStatus == "No"):
if "Assessment: " in get2ndData:
new_r += get2ndData.replace("Assessment: ", ' ')
if isinstance(block, Table):
table = block
for row in table.rows:
for cell in noDupCell(row):
for para1 in cell.paragraphs:
if (sector == stopper_neuro):
if (getStatus == "No"):
get2ndData = para1.text
get2ndData = get2ndData.replace("\n", ' ')
get2ndData = get2ndData.replace("\xa0", ' ')
if (
get2ndData == "Date Initiated" or get2ndData == "Problem" or get2ndData == "Desired Outcome"
or get2ndData == "Nursing Care Plan" or get2ndData == "Evaluation"):
print()
else:
getDataList1.append(get2ndData)
else:
getStatus = "Yes"
else:
print()
'''Neurosensory Sector'''
if (getStatus == "Yes"):
neurosensoryList = {"Neurosensory": {"Status": getStatus}}
xferList = neurosensoryList
else:
neurosensoryList = {"Neurosensory": {"Status": getStatus, "Assessment": new_r,
"Date Initiated": getDataList1[0], "Problem": getDataList1[1],
"Desired Outcome": getDataList1[2], "Nursing Care Plan": getDataList1[3],
"Evaluation": getDataList1[4]}}
xferList = neurosensoryList
one_document.update(**xferList)
getFilenameOnly = os.path.basename(source_directory)
if "Admission" in getFilenameOnly:
arr = []
arr += [one_document]
elif "Discharge" in getFilenameOnly:
print(one_document)
else:
print(one_document)
''' ######################################################################## '''
root_dir = r"C:\Users\user\Documents\Project P3\Project updated 9.10.18 (Tue) trying\FYP\dataprep\source_documents"
for filename in os.listdir(root_dir):
source_directory = root_dir + "/" + filename
progress_notes_sector(source_directory, one_document)
neurosensory_sector(source_directory, one_document)
one_document = {}