from docx import Document
# Characters to strip from every paragraph: the lowercase ASCII letters plus
# a fixed set of punctuation marks, stored as a list of one-character strings.
alphaDic = list(
    "abcdefghijklmnopqrstuvwxyz"
    "!?.~,()$-:;'/"
)

# The document to clean, loaded once at import time.
doc = Document('realexample.docx')

# Index of the paragraph currently being processed.
docIndex = 0
def delete_paragraph(paragraph):
    """Detach *paragraph*'s XML element from its parent and clear the proxy.

    Args:
        paragraph: Object exposing ``_element`` (an element that provides
            ``getparent()``) and ``_p``. Presumably a python-docx
            ``Paragraph`` -- confirm against callers.

    Returns:
        None. After the call ``paragraph._p`` and ``paragraph._element``
        are both ``None``; calling it again on the same object is a no-op.
    """
    element = paragraph._element
    if element is None:
        # Already deleted through this proxy; nothing left to detach.
        return

    parent = element.getparent()
    if parent is not None:
        parent.remove(element)

    # Clear the references on the paragraph object itself, not on the XML
    # element, so the stale proxy no longer points at the detached element.
    paragraph._p = paragraph._element = None
# Remove every character listed in alphaDic (in either case) from each
# paragraph, collapse runs of whitespace to single spaces, and delete the
# paragraphs that end up empty.
# The translation table does not depend on the paragraph, so build it once.
_strip_chars = "".join(alphaDic)
_strip_table = str.maketrans("", "", _strip_chars + _strip_chars.upper())

# Iterate over a snapshot of the paragraph list: deleting a paragraph during
# the walk then cannot skip or repeat an entry, and the list is fetched once
# instead of several times per iteration.
# NOTE(review): nothing in this section saves the document -- confirm that
# doc.save(...) happens elsewhere.
for _paragraph in list(doc.paragraphs):
    _cleaned = " ".join(_paragraph.text.translate(_strip_table).split())
    if _cleaned:
        _paragraph.text = _cleaned
    else:
        delete_paragraph(_paragraph)
所以测试文档看起来像这样
Hello
你好
Good afternoon
朋友们
Good evening
晚上好
我想得到下面这样的结果:
你好
朋友们
晚上好
目前代码会删除所有空段落和多余的空格,结果如下所示,所以我有点卡在这里。我只想删除由英文单词造成的换行。
你好
朋友们
晚上好
答案 0 :(得分:0)
你可以先查找英文单词,一旦找到某个英文单词 "WORD",就在它后面附加 "\n",然后从文件中删除这个新得到的 "WORD\n"。在 Python 中用 + 号拼接字符串,只需写 "WORD" + "\n"。