我正在尝试从 docx 文件中提取页面和页眉(header)数据。该文件有几百页,每一页都有一个表格和一个页眉。页眉中包含需要与每个表格配对的相关信息。我能够提取页眉和表格数据,但无法将它们可靠地配对在一起。
以下是我目前使用 win32com 写出的代码:
# Get the page number on which the first table of the document ends.
# NOTE(review): this snippet relies on `Dispatch` (presumably
# win32com.client.Dispatch) and `filename` being defined elsewhere.
app = Dispatch("Word.Application")
doc = app.Documents.Open(filename)
table_1_page = doc.Tables(1).Range.Information(3) # 3 == wdActiveEndPageNumber
问题在于页眉是 TextFrame,并且在多个页面上重复出现,所以当我调用以下代码时:
# Get the header page number: ask for the page on which the text range of the
# first shape in header 1 of section 1 ends (3 == wdActiveEndPageNumber).
# Relies on `doc` being an already opened Word document.
doc.Sections(1).Headers(1).Shapes(1).TextFrame.TextRange.Information(3)
我得到的只是该 TextFrame 出现的其中一个页面的页码。被选中的页面似乎有些随意:有时是第一页,有时是最后一页,无法预测。
我花了一些时间阅读 Word 对象模型的文档(原文此处为链接)。最终,我希望能够在不重新发明轮子的情况下,获取每一页上显示的所有内容。
# filename docx_parser.py
import pythoncom
class OpenDoc(object):
    """Read tables and header text frames from a Word document via COM.

    The constructor starts (or attaches to) ``Word.Application`` through
    win32com, hides it and opens ``docx_path``.  Call :meth:`close` (or use
    the instance as a context manager) to quit Word when done; ``__del__``
    does the same as a last resort.

    Attributes:
        path: the path exactly as passed to the constructor.
        word: the Word ``Application`` COM object (``None`` after close).
        doc: the opened ``Document`` COM object (``None`` after close).
    """

    # WdInformation.wdActiveEndPageNumber
    _WD_ACTIVE_END_PAGE_NUMBER = 3
    # Characters stripped from both ends of every cell text: chr(7) and chr(13).
    _CELL_STRIP_CHARS = chr(7) + chr(13)

    def __init__(self, docx_path):
        """Open ``docx_path`` in a hidden Word instance.

        Args:
            docx_path: path of the document to open.
        """
        import os
        import win32com.client as win32

        self.path = docx_path
        # Set both attributes up front so close()/__del__ work even if the
        # Dispatch or Open call below raises.
        self.word = None
        self.doc = None

        self.word = win32.Dispatch("Word.Application")
        self.word.Visible = 0

        # Word resolves relative paths against its own working directory, not
        # Python's, so hand it an absolute path whenever the file exists
        # locally.  Anything else is passed through untouched.
        target = docx_path
        if os.path.exists(docx_path):
            target = os.path.abspath(docx_path)

        # Use the document returned by Open rather than ActiveDocument, so we
        # never pick up some other document that happens to be active.
        self.doc = self.word.Documents.Open(target)

    def __enter__(self):
        """Return ``self`` so the instance can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit Word on leaving the ``with`` block; never swallows errors."""
        self.close()
        return False

    def close(self):
        """Quit the Word instance.  Safe to call more than once."""
        word = getattr(self, "word", None)
        if word is None:
            return
        # Drop the references first so a failing Quit() is not retried.
        self.word = None
        self.doc = None
        word.Quit()

    def get_table_count(self):
        """Return the number of tables in the document."""
        return self.doc.Tables.Count

    def count_table_rows(self, table):
        """Return ``table.Rows.Count``."""
        return table.Rows.Count

    def count_table_columns(self, table):
        """Return ``table.Columns.Count``."""
        return table.Columns.Count

    def get_headers(self):
        """Yield ``(text, page_num)`` for every shape in header 1 of section 1.

        ``page_num`` is whatever Word reports as the active end page of the
        shape's text range.  For a header repeated on many pages this is a
        single page and is not a reliable per-page mapping.
        """
        headers = self.doc.Sections(1).Headers(1)
        shapes = headers.Shapes
        for shape_num in range(1, shapes.Count + 1):
            t_range = shapes(shape_num).TextFrame.TextRange
            page_num = t_range.Information(self._WD_ACTIVE_END_PAGE_NUMBER)
            yield t_range.Text, page_num

    def get_table_text(self, table):
        """Yield one list of cell strings per row of ``table``.

        Cells that cannot be addressed (the COM call raises
        ``pythoncom.com_error``) are reported as empty strings so every row
        has ``Columns.Count`` entries.
        """
        col_count = self.count_table_columns(table)
        row_count = self.count_table_rows(table)
        for row in range(1, row_count + 1):
            row_data = []
            for col in range(1, col_count + 1):
                try:
                    text = table.Cell(Row=row, Column=col).Range.Text
                except pythoncom.com_error:
                    row_data.append("")
                else:
                    row_data.append(text.strip(self._CELL_STRIP_CHARS))
            yield row_data

    def get_all_table_text(self):
        """Yield, for each table in the document, a list of its rows."""
        for table in self.get_tables():
            yield list(self.get_table_text(table))

    def get_tables(self):
        """Yield every table of the document in document order."""
        for table in self.doc.Tables:
            yield table

    def __del__(self):
        # Last-resort cleanup; close() tolerates a half-initialised instance.
        self.close()
if __name__ == "__main__":
    # Demo driver: dump every table and every header shape of sample.docx.
    try:
        path = r"sample.docx"
        open_doc = OpenDoc(path)
        for table_num, table_text in enumerate(open_doc.get_all_table_text(), 1):
            print("\n-------------- Table %s ----------------" % table_num)
            for row_data in table_text:
                print(", ".join(row_data))
        for header_text, page_num in open_doc.get_headers():
            print("header page number: %s, text: %s" % (page_num, header_text))
    except Exception:
        # Top-level boundary: show the full traceback instead of dying
        # silently, so the console pause below still gets a chance to run.
        from traceback import format_exc
        print(format_exc())

    # Keep the console window open until the user presses Enter.
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        pause = raw_input
    except NameError:
        pause = input
    pause("")