以多个html文件作为输入,生成多个pdf文件

时间:2019-07-01 16:04:36

标签: python

我想问:
我的程序在只有一个输入文件时可以正常工作,但当我尝试添加其他输入文件时就无效了。如何以多个html文件作为输入,为每个文件分别生成对应的pdf?

def get_static_report_information(filename=None):
    """Parse an HTML report and return its metadata and chapter outline.

    Args:
        filename: Optional path of a single HTML file to parse. When
            omitted, every ``*.html`` file found in ``file_input_dir`` is
            read and the information of the last one returned by ``glob``
            is used (the historical behaviour). To handle many reports,
            call this function once per file and pass the path here.

    Returns:
        A 7-tuple ``(raw_chapters, nom_rapport, reference_rapport,
        page_soup, sections_seperator, new_grouped, version)`` where

        * ``raw_chapters`` -- texts (trailing whitespace removed) of the
          ``h2`` headings whose ``id`` ends with digits;
        * ``nom_rapport`` -- stripped text of the ``<a href="#1">`` link;
        * ``reference_rapport`` -- text of the selected ``h1`` with
          ``"Volume"`` removed and surrounding whitespace stripped;
        * ``page_soup`` -- the parsed document;
        * ``sections_seperator`` -- text of the first ``td.row_cell``
          whose string matches ``\\.pactext``;
        * ``new_grouped`` -- list of ``[h1_title, [h2_title, ...]]``;
        * ``version`` -- stripped text of the first ``td.row_cell`` whose
          string contains a digit.

    Raises:
        FileNotFoundError: if no ``filename`` is given and the input
            directory contains no ``*.html`` file.
        AttributeError: if one of the expected elements is missing from
            the document (the lookup returns ``None``).
    """
    if filename is None:
        html_paths = glob.glob(os.path.join(file_input_dir, "*.html"))
    else:
        html_paths = [filename]
    if not html_paths:
        raise FileNotFoundError(
            f"No .html report found in {file_input_dir!r}"
        )

    # NOTE: without an explicit filename every match is still parsed, but
    # only the last parse survives the loop and feeds the extraction below.
    page_soup = None
    for path in html_paths:
        with codecs.open(path, "rb", "utf8") as f:
            page_soup = soup(f.read(), "html.parser")
        print("-------------  before output1 \n")

    separator_cell = page_soup.find(
        "td", {'class': 'row_cell'}, string=re.compile(r'\.pactext')
    )
    sections_seperator = separator_cell.get_text()

    # Report reference. Presumably {"id": False} selects the first <h1>
    # that has no id attribute -- verify against the bs4 documentation.
    ref = page_soup.find("h1", {"id": False})
    # A Tag has no str.replace(); the replacement must run on its text.
    reference_rapport = ref.get_text().replace("Volume", "").strip()

    vers = page_soup.find("td", {'class': 'row_cell'}, string=re.compile(r'\d'))
    version = vers.text.strip()

    nom = page_soup.find("a", {"href": "#1"})
    nom_rapport = nom.text.strip()

    # Headings (h1/h2) whose id ends with digits, in document order.
    chapters = page_soup.find_all(
        re.compile('h1|h2'), {'id': re.compile(r'\d+$')}
    )
    trailing_ws = re.compile(r'\s+$')
    data = [[tag.name, trailing_ws.sub('', tag.text)] for tag in chapters]
    raw_chapters = [text for name, text in data if name == 'h2']

    # Consecutive runs of h1 headings and of non-h1 headings alternate;
    # they are paired as (h1 run, following h2 run).
    runs = [list(items) for _, items in groupby(data, key=lambda x: x[0] == 'h1')]
    new_grouped = []
    for i in range(0, len(runs), 2):
        title = runs[i][0][-1]
        # A trailing h1 run without sub-chapters yields an empty list
        # instead of an IndexError.
        sub_titles = [text for _, text in runs[i + 1]] if i + 1 < len(runs) else []
        new_grouped.append([title, sub_titles])

    return raw_chapters, nom_rapport, reference_rapport, page_soup, sections_seperator, new_grouped, version

0 个答案:

没有答案