Python - 将PDF数据解析为表格格式

时间:2018-03-21 21:05:38

标签: python parsing pdf web-scraping

我正在尝试复制PDF中表格中的数据:http://www.ct.gov/hix/lib/hix/CT_DSG_-12132014_version_1.2_%28with_clarifications%29.pdf

我当前的代码只是拉第一个表的第二页,即文档中的第11页(标记为第2页)。这是我正在使用的代码:

# Download the PDF and dump the raw extracted text of page 11
# (zero-based index 10) to a CSV file.
import io, re
import PyPDF2
import requests

url = 'http://www.ct.gov/hix/lib/hix/CT_DSG_-12132014_version_1.2_%28with_clarifications%29.pdf'

r = requests.get(url)
f = io.BytesIO(r.content)

reader = PyPDF2.PdfFileReader(f)
contents = reader.getPage(10).extractText()

# Rough tokenization: insert a space before every capital letter, then split
# on whitespace.
# NOTE(review): this splits ALL-CAPS words into single letters, and `data`
# is computed but never written out below — only the raw `contents` is saved.
data = re.sub(r"([A-Z])", r" \1", contents).split()

# Use a context manager so the file is closed even on error, and avoid
# naming the handle `csv`, which shadows the stdlib module of that name.
with open('AWStest.csv', 'w') as out_file:
    out_file.write(contents)

我目前能够以粗略的CSV格式提取数据,但无法弄清楚如何解析数据,以便按照原始PDF中表格的结构来存储这些抓取到的数据。这是它当前的样子,所有间距都是CSV格式的换行符:

ColElement 数据 元件 名称日期 ModifiedFormatLengthDescriptionElement 服从 指南 评论 条件 (分母) 推荐的 阈 会员 合格 DataContents 指南 2013年12月5日 3ME003Insurance 类型码/产品 4/1 / 2013Lookup 表

文本 2Type /产品 鉴定 码 报告 该 codethat 定义 该 类型 ofinsurance 下 哪一个 这个 构件' S 合格 ismaintained。 例: HM = HMO 码 描述 9Self 工资 11Other 非 联邦 程式 *(使用 这个的 值 要求 泄露 toDataManager 先 tosubmission) 12Preferred 提供商 组织 (PPO) * 13Point ofService (POS) * 14Exclusive 提供商 组织 (EPO) * 15Indemnity 保险 16Health 保养 组织 (HMO) 医保 风险 (使用 报告 医保 C部分/医疗 优点 计划) 17Dental 保养 组织 (DMO) * 96Husky 健康 A97Husky 健康 B98Husky 健康 C99Husky 健康 DAMAutomobile 医 * CHChampus (现在 TRICARE) * DSDisability * HMHealth 保养 组织 * LMLiability 医 MAMedicare A部分(医疗 Feefor 服务 只要) MBMedicare B部分*(医保 Feefor 服务 只要) MCMedicaid * MDMedicare PartDOFOther 联邦 程序 (使用 这个的 值 要求 泄露 toDataManager 先 tosubmission) TVTitle V VAVeterans 事务 计划 * WCWorkers' 赔偿金 * ZZMutually 定义 *(使用 这个的 值 要求 泄露 toDataManager 先 tosubmission) All96.0%

此示例数据表示标题行和第一行数据。我已经能够打破基于大写的单词,但不幸的是,它将完全大写的单词分解为单个字母。我用了这段代码:

# Re-read the dumped text and break each line apart on capital letters.
# NOTE(review): the original also called line.split('[a-zA-Z][^A-Z]*');
# str.split takes a literal separator, not a regex, and its return value
# was discarded — the call was a no-op and has been removed. The file is
# now opened with a context manager so it is always closed.
with open('AWStest.csv', 'r') as fcsv:
    for line in fcsv:
        line = line.strip()
        print(re.findall('[A-Z][^A-Z]*', line))

我需要帮助找出以一种格式重现这个完整表的最佳方法,这种格式允许我将其加载到NoSQL数据库并查询各行的要求以生成报告。为了做到这一点,添加到我的代码的最佳方法是什么?是否有更好的方法可以更准确地格式化PDF?

1 个答案:

答案 0 :(得分:0)

听起来好像页面上的文字位置会对你有很大的帮助。我建议使用PyMuPDF提取带有位置数据的文本,以便找到一行。

这是一个代码示例,用于获取带有位置的* .csv文本文件。这可以让你开始用Python挖掘信息。

#!python3.3
""" Use PyMuPDF to extract text, with position data, to a *.csv file. """
import csv
import json
import os
import sys

import fitz

# Validate the command line explicitly rather than with `assert`, which is
# silently stripped when Python runs with -O.
if len(sys.argv) != 2:
    sys.exit('Pass file name as parameter')

srcfilename = sys.argv[1]
if not os.path.isfile(srcfilename):
    sys.exit('File {} does not exist'.format(srcfilename))

# Output goes next to the input, with .csv appended.
dstfilename = '{}.csv'.format(srcfilename)
with open(dstfilename, 'w', encoding='utf-8', errors='ignore', newline='') as dstfile:
    writer = csv.writer(dstfile)
    # One row per text span: page number plus the span's bounding box.
    writer.writerow([
        'PAGE',
        'X1',
        'Y1',
        'X2',
        'Y2',
        'TEXT',
    ])
    document = fitz.open(srcfilename)
    for page_number in range(document.pageCount):
        # getPageText(..., output='json') returns a JSON string describing
        # blocks -> lines -> spans, each span carrying a bbox and its text.
        text_dict = json.loads(document.getPageText(page_number, output='json'))
        for block in text_dict['blocks']:
            if block['type'] != 'text':
                # Skip image blocks; only text blocks have 'lines'.
                continue
            for line in block['lines']:
                for span in line['spans']:
                    writer.writerow([
                        page_number,
                        span['bbox'][0],
                        span['bbox'][1],
                        span['bbox'][2],
                        span['bbox'][3],
                        span['text'],
                    ])
    document.close()

这是我编写的一些代码,用于挖掘PDF并将其转换为格式更好的* .csv文件:

#!python3.3
import collections
import csv
import json
import os

import fitz  # PyMuPDF package


class MemberEligibility(object):

    """ One row of the Member Eligibility Data Contents Guide table. """

    def __init__(self):
        """
        Create a row with every column blank. All fields are plain strings;
        callers may want to convert some (dates, lengths) later.
        """
        # One attribute per table column, in the order they appear in the PDF.
        for attr in (
            'col',
            'element',
            'data_element_name',
            'date_modified',
            'fmt',
            'length',
            'description',
            'comments',
            'condition',
            'recommended_threshold',
        ):
            setattr(self, attr, '')


def get_sorted_list(document, page_number):
    """
    Return the text spans on *page_number* of *document* as a sorted list of
    (top-left y, top-left x, text) tuples.

    Sorting is top-to-bottom, then left-to-right. Coordinates are truncated
    to integers so spans whose y-coordinates differ only fractionally line
    up on the same row.
    """
    page = json.loads(document.getPageText(page_number, output='json'))
    # Flatten blocks -> lines -> spans, keeping only text blocks
    # (image blocks carry no 'lines' key).
    spans = [
        (
            int(span['bbox'][1]),  # Top-left y-coordinate
            int(span['bbox'][0]),  # Top-left x-coordinate
            span['text'],          # Text itself
        )
        for block in page['blocks']
        if block['type'] == 'text'
        for line in block['lines']
        for span in line['spans']
    ]
    return sorted(spans)


def main():
    """
    Mine p. 11 (zero-based page 10) of the downloaded PDF for the Member
    Eligibility table and write it to EligibilityDataContentsGuide.csv.

    Rows are recognised by x-coordinate ranges: a span in the 'Col' column
    starts a new MemberEligibility row, and spans in the other column bands
    are appended to the current row's fields.
    """
    # Downloaded PDF to same folder as this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    pdf_filename = os.path.join(
        script_dir,
        'CT_DSG_-12132014_version_1.2_(with_clarifications).pdf'
    )

    # Mine PDF for data
    document = fitz.open(pdf_filename)
    # OrderedDict so iteration occurs in the same order as rows appear in
    # the PDF
    member_eligibility_dict = collections.OrderedDict()
    # Current row being filled; None until the first 'Col' cell is seen.
    # (The original code referenced `row` before assignment, raising
    # UnboundLocalError if a span appeared before the first 'Col' cell.)
    row = None
    for page_number in range(document.pageCount):
        # Page numbers are zero-based. Only p. 11 of the PDF is handled here.
        if page_number != 10:
            continue
        text_list = get_sorted_list(document, page_number)
        for y, x, text in text_list:
            if not 115 < y < 575:
                # Skip text whose y-coordinate is outside the data portion
                # of the table (headers, footers, etc.).
                continue
            if 25 < x < 72:
                # Assuming one row of text per cell in this column but
                # this doesn't appear to hold on p. 10 of PDF so may
                # need to be modified if you're going to do whole table
                row = MemberEligibility()
                row.col = text
                member_eligibility_dict[row.col] = row
            elif row is None:
                # Text appeared before any 'Col' cell; nothing to attach
                # it to, so skip it rather than crash.
                continue
            elif 72 < x < 118:
                row.element += text
            elif 118 < x < 175:
                row.data_element_name += text
            elif 175 < x < 221:
                row.date_modified += text
            elif 221 < x < 268:
                row.fmt += text
            elif 268 < x < 315:
                row.length += text
            elif 315 < x < 390:
                row.description += text
            elif 390 < x < 633:
                row.comments += text
            elif 633 < x < 709:
                row.condition += text
            elif 709 < x < 765:
                row.recommended_threshold += text
    document.close()

    # Write data to *.csv
    csv_filename = os.path.join(script_dir, 'EligibilityDataContentsGuide.csv')
    with open(csv_filename, 'w', encoding='utf-8', errors='ignore', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([
            'Col',
            'Element',
            'Data Element Name',
            'Date Modified',
            'Format',
            'Length',
            'Description',
            'Element Submission Guideline Comments',
            'Condition (Denominator)',
            'Recommended Threshold'
        ])
        for row in member_eligibility_dict.values():
            writer.writerow([
                row.col,
                row.element,
                row.data_element_name,
                row.date_modified,
                row.fmt,
                row.length,
                row.description,
                row.comments,
                row.condition,
                row.recommended_threshold
            ])


if __name__ == '__main__':
    main()

你可能需要做更多工作才能得到你想要的东西。