如何将所有从MS单词表中提取的数据存储到python中的数组中?

时间:2018-10-09 02:18:07

标签: python arrays extraction

这是我如何从MS Word文档中提取数据的完整代码。 我想做的是将包含来自MS Word的数据的one_document存储到数组中,但是在打印时遇到错误

This is the error 
 File "C:/Users/user/Documents/Proj P3/Proj updated 9.10.18 (Tue) trying/FYP/preprocessing/inpatientcare_extractor.py", line 149, in neurosensory_sector
    "Date Initiated": getDataList1[0], "Problem": getDataList1[1],
IndexError: list index out of range

请帮我看看如何将one_document存储在数组中:((谢谢 以下代码是我如何从每个Word文档表单元格提取数据的完整代码。

from __future__ import (
    absolute_import, division, print_function, unicode_literals
)
import docx
from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
import os
import os.path
from preprocessing import docs2txt_source


''' ######################################################################## '''

one_document = {}

def iter_block_items(parent):
    if isinstance(parent, _Document):
        parent_elm = parent.element.body
        # print(parent_elm.xml)
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

def noDupCell(row):
    prior_tc = None
    for cell in row.cells:
        this_tc = cell._tc
        if this_tc is prior_tc:  # skip cells pointing to same `<w:tc>` element
             continue
        yield cell
        prior_tc = this_tc

def progress_notes_sector(source_directory, one_document):
    sector = ""
    stopper_neuro = "  Neurosensory "
    '''For Progress Note'''
    getDataList = []

    document = Document(source_directory)
    '''Progress Notes'''
    for block in iter_block_items(document):
        if isinstance(block, Paragraph):
            # if the header is  "Progress Notes" , the sector will replace to be "Progress Note"
            if (block.text == "  Progress Notes "):
                sector = block.text
            #if the block.text == "  Progress Notes "
            get2ndData = block.text
            #loop and get whatever data that is before stopper_neuro
            if (get2ndData == stopper_neuro):
                break
            #add into the getDataList
            else:
                getDataList.append(get2ndData)


    getFilenameOnly = os.path.basename(source_directory)
    categoryType = ('_'.join(getFilenameOnly.split('_')[1:3]))
    categoryType = categoryType.replace("_", "")
    docNumber = ('_'.join(getFilenameOnly.split('_')[:1]))
    getString_pro = ""


    for i in range(len(getDataList)):
        getString = getDataList[i] + ". "
        getString = getString.replace("  Progress Notes . ", "")
        getString_pro += getString

    '''Progress Note Sector'''
    progressList = {"Id": docNumber, "Type": categoryType, "Progress Notes": {"Remarks": getString_pro}}
    one_document.update(**progressList)

def neurosensory_sector(source_directory, one_document):
    sector = ""
    stopper_Cardio = "  Cardiovascular  "
    stopper_neuro = "  Neurosensory "
    # Status for Neuro
    getStatus = ""

    '''For Neurosensory'''
    getDataList1 = []
    list_ass = []
    new_r = ""
    document = Document(source_directory)

    '''Neurosensory Sector'''
    for block in iter_block_items(document):
        if isinstance(block, Paragraph):
            # print(block.text)

            #find the word that is beside the assessment
            # for word in block.text.split():
            #     #add the block.text into a list
            #     list_ass.append(word)

            if (block.text == stopper_neuro):
                sector = block.text

            get2ndData = block.text
            # loop and get whatever data that is before stopper_neuro
            if (get2ndData == stopper_Cardio):
                break
            elif (get2ndData == "No" or get2ndData == "Yes"):
                getStatus += get2ndData

                #it will check is "Assessment:" exist in the get2ndData, If it exist it will replace it blank and store whatever that is beside the Assessment in new_r
            if (getStatus == "No"):
                if "Assessment: " in get2ndData:
                    new_r += get2ndData.replace("Assessment: ", ' ')

        if isinstance(block, Table):
            table = block
            for row in table.rows:
                for cell in noDupCell(row):
                    for para1 in cell.paragraphs:
                        if (sector == stopper_neuro):
                            if (getStatus == "No"):
                                get2ndData = para1.text
                                get2ndData = get2ndData.replace("\n", ' ')
                                get2ndData = get2ndData.replace("\xa0", ' ')

                                if (
                                        get2ndData == "Date Initiated" or get2ndData == "Problem" or get2ndData == "Desired Outcome"
                                        or get2ndData == "Nursing Care Plan" or get2ndData == "Evaluation"):
                                    print()
                                else:
                                    getDataList1.append(get2ndData)
                            else:
                                getStatus = "Yes"
                        else:
                            print()

    '''Neurosensory Sector'''
    if (getStatus == "Yes"):
        neurosensoryList = {"Neurosensory": {"Status": getStatus}}
        xferList = neurosensoryList
    else:
        neurosensoryList = {"Neurosensory": {"Status": getStatus, "Assessment": new_r,
                                             "Date Initiated": getDataList1[0], "Problem": getDataList1[1],
                                             "Desired Outcome": getDataList1[2], "Nursing Care Plan": getDataList1[3],
                                             "Evaluation": getDataList1[4]}}
        xferList = neurosensoryList

    one_document.update(**xferList)

   getFilenameOnly = os.path.basename(source_directory)

    if "Admission" in getFilenameOnly:
        arr = []
        arr += [one_document]
    elif "Discharge" in getFilenameOnly:
        print(one_document)
    else:
        print(one_document)
''' ######################################################################## '''

root_dir = r"C:\Users\user\Documents\Project P3\Project updated 9.10.18 (Tue) trying\FYP\dataprep\source_documents"

for filename in os.listdir(root_dir):
        source_directory = root_dir + "/" + filename
        progress_notes_sector(source_directory, one_document)
        neurosensory_sector(source_directory,  one_document)
        one_document = {}

0 个答案:

没有答案