Question

我需要在许多docx文件中删除页眉和页脚。我目前正在尝试使用python-docx库，但此时它不支持docx文档中的页眉和页脚（正在进行中）。

有没有办法在Python中实现这一点？

据我了解，docx是一种基于xml的格式，但我不知道如何使用它。

P.S. 我有一个想法：使用 lxml 或 BeautifulSoup 来解析 XML 并替换其中的某些部分，但这种做法看起来不太优雅。

UPD. 感谢 Shawn，这是一个很好的起点。我对脚本做了一些修改，下面是我的最终版本（它对我有用，因为我需要编辑许多 .docx 文件）。我使用 BeautifulSoup，因为标准的 XML 解析器无法得到有效的 XML 树。另外，我的 docx 文档在 XML 中并没有真正的页眉和页脚，它们只是把页眉和页脚的图片放在了页面顶部。此外，为了获得更快的速度，你可以使用 lxml 代替 BeautifulSoup。

import zipfile
import shutil as su
import os
import tempfile
from bs4 import BeautifulSoup


def get_xml_from_docx(docx_filename):
    """Read the main document part out of a .docx archive.

    Args:
        docx_filename: Path (or file-like object) of the .docx file, which
            is opened as a zip archive.

    Returns:
        The raw bytes of the ``word/document.xml`` member.
    """
    archive = zipfile.ZipFile(docx_filename)
    try:
        return archive.read('word/document.xml')
    finally:
        # Release the archive whether or not the member could be read.
        archive.close()


def write_and_close_docx(self, edited_xml, output_filename):
    """Write a copy of a .docx with ``word/document.xml`` replaced.

    Every member of the original archive is copied to the new archive
    unchanged, except ``word/document.xml``, whose content is replaced by
    ``edited_xml``.

    Args:
        self: Path of the original .docx file (despite the name, this is a
            plain path, not an instance).
        edited_xml: The new document XML. ``bytes`` are written as-is; any
            other object (e.g. a BeautifulSoup tree) is converted with
            ``str()`` and encoded as UTF-8.
        output_filename: Path of the .docx file to create. May be the same
            path as ``self``.
    """
    if isinstance(edited_xml, bytes):
        new_document = edited_xml
    else:
        # Encode explicitly instead of relying on the platform's default
        # text encoding, which is not UTF-8 everywhere.
        new_document = str(edited_xml).encode('utf-8')

    # Load every member into memory before opening the output, so that
    # output_filename may safely be the same file as the input.
    with zipfile.ZipFile(self) as source:
        members = [(info, source.read(info.filename))
                   for info in source.infolist()]

    # The context manager guarantees the archive is finalised and closed;
    # nothing is left to garbage collection.
    with zipfile.ZipFile(output_filename, "w") as docx:
        for info, data in members:
            if info.filename == 'word/document.xml':
                data = new_document
            # Passing the original ZipInfo keeps each member's name,
            # timestamp and compression method.
            docx.writestr(info, data)


if __name__ == '__main__':
    # Batch job: for every .docx in `directory`, drop the matching shapes
    # from the document XML and save the result as
    # edited/<name>-edited.docx.
    directory = 'your_directory/'
    output_directory = 'edited'
    # The output folder must exist before the first archive is written.
    os.makedirs(output_directory, exist_ok=True)

    for file in os.listdir(directory):
        if not file.endswith('.docx'):
            continue
        word_doc = os.path.join(directory, file)
        # splitext removes exactly the extension. str.rstrip('.docx')
        # strips any trailing run of the characters ". d o c x", which
        # mangles names such as "doc.docx" or "index.docx".
        stem = os.path.splitext(file)[0]
        new_word_doc = os.path.join(output_directory, stem + '-edited.docx')

        soup = BeautifulSoup(get_xml_from_docx(word_doc), 'xml')
        for shape in soup.find_all('shape'):
            # A shape without a style attribute yields None; treat it as
            # "no match" instead of raising TypeError.
            # NOTE(review): 'margin-left:0pt' is presumably what marks the
            # header/footer images in the author's documents; confirm
            # against your own files.
            if 'margin-left:0pt' in (shape.get('style') or ''):
                shape.parent.decompose()
        write_and_close_docx(word_doc, soup, new_word_doc)

就是这样 :) 我知道代码不够整洁，抱歉。

Answer 1

好吧，我以前从未研究过这个问题，但我刚刚创建了一个带有页眉和页脚的 test.docx。docx 文件本质上是一个 zip 压缩包，解压（unzip）之后就能得到其中的 XML 文件。对于我的简单测试用例，解压结果如下：

word/
_rels           footer1.xml     styles.xml
document.xml        footnotes.xml       stylesWithEffects.xml
endnotes.xml        header1.xml     theme
fontTable.xml       settings.xml        webSettings.xml

打开 word/document.xml 就能看到主要的问题所在。您可以在其中看到引用页眉和页脚的元素。在我的简单案例中，我得到了：

<w:headerReference w:type="default" r:id="rId7"/>
<w:footerReference w:type="default" r:id="rId8"/>

和

<w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="720" w:footer="720" w:gutter="0"/>

整个文档的 XML 其实非常小：

<w:document xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mo="http://schemas.microsoft.com/office/mac/office/2008/main" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:mv="urn:schemas-microsoft-com:mac:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">
<w:body>
  <w:p w:rsidR="009E6E8F" w:rsidRDefault="009E6E8F"/>
  <w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA"/>
  <w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA"/><w:p w:rsidR="00B53FFA" w:rsidRDefault="00B53FFA">
  <w:r>
  <w:t>MY BODY</w:t>
  </w:r>
  <w:bookmarkStart w:id="0" w:name="_GoBack"/>
  <w:bookmarkEnd w:id="0"/>
  </w:p>
  <w:sectPr w:rsidR="00B53FFA" w:rsidSect="009E6E8F">
  <w:headerReference w:type="default" r:id="rId7"/> 
  <w:footerReference w:type="default" r:id="rId8"/>
  <w:pgSz w:w="12240" w:h="15840"/>
  <w:pgMar w:top="1440" w:right="1800" w:bottom="1440" w:left="1800" w:header="720" w:footer="720" w:gutter="0"/>

因此，无论是在功能上还是在性能上，XML 操作都不会成为问题。下面这段代码可以把 docx 读入 Python、解析成 XML 树，再重新保存为 docx。我现在得出门，所以这还不是完整的解决方案，但应该足以让你顺利起步。如果你仍然遇到麻烦，我稍后会回来看看你的进展。

import zipfile
import shutil as su
import os
import tempfile
import xml.etree.cElementTree


 def get_word_xml(docx_filename):
   with open(docx_filename, mode='rt') as f:
      zip = zipfile.ZipFile(f)
      xml_content = zip.read('word/document.xml')
   return xml_content


def write_and_close_docx(self, xml_content, output_filename):
    """Write a copy of a .docx with ``word/document.xml`` replaced.

    Every member of the original archive is copied to the new archive
    unchanged, except ``word/document.xml``, which is replaced by the
    serialised form of ``xml_content``.

    Args:
        self: Path of the original .docx file (despite the name, this is a
            plain path, not an instance).
        xml_content: The edited document, either an
            ``xml.etree.ElementTree.ElementTree`` or a bare ``Element``.
        output_filename: Path of the .docx file to create. May be the same
            path as ``self``.
    """
    # Local import: the file's import block only pulls in
    # xml.etree.cElementTree, which does not exist on Python 3.9+.
    import xml.etree.ElementTree as ET

    # tostring() needs an Element; unwrap an ElementTree if one was given.
    root = xml_content.getroot() if hasattr(xml_content, 'getroot') else xml_content
    # NOTE(review): ElementTree may rewrite namespace prefixes when
    # serialising unless they were registered with ET.register_namespace;
    # verify that Word still opens the result for your documents.
    xml_bytes = ET.tostring(root, encoding='UTF-8', xml_declaration=True)

    # Load every member into memory before opening the output, so that
    # output_filename may safely be the same file as the input.
    with zipfile.ZipFile(self) as source:
        members = [(info, source.read(info.filename))
                   for info in source.infolist()]

    with zipfile.ZipFile(output_filename, "w") as docx:
        for info, data in members:
            if info.filename == 'word/document.xml':
                data = xml_bytes
            # Passing the original ZipInfo keeps each member's name,
            # timestamp and compression method.
            docx.writestr(info, data)

def get_xml_tree(f):
    """Parse XML into an ``ElementTree``.

    Args:
        f: Raw XML as ``bytes`` (e.g. the content of ``word/document.xml``),
            or a filename / file object as accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        The parsed ``xml.etree.ElementTree.ElementTree``.
    """
    # Local imports: the file's import block only pulls in
    # xml.etree.cElementTree, which does not exist on Python 3.9+.
    import io
    import xml.etree.ElementTree as ET

    # parse() treats its argument as a filename or file object, so raw
    # bytes must be wrapped in an in-memory file first.
    if isinstance(f, (bytes, bytearray)):
        f = io.BytesIO(f)
    return ET.parse(f)

# Round-trip demo: read the document XML out of TEXT.docx, parse it into a
# tree, and save it back into a new archive named SLIM.docx.
word_doc, new_word_doc = 'TEXT.docx', 'SLIM.docx'
doc = get_word_xml(word_doc)
tree = get_xml_tree(doc)
# The original file's path is passed along with the tree and the output name.
write_and_close_docx(word_doc, tree, new_word_doc)

Python - 从docx文件中删除页眉和页脚

1 个答案: