通过ElementTree提取<strong>文本

时间:2016-11-15 07:56:54

标签: python elementtree hocr

我尝试运行以下代码以从XML文件中提取所有文本:

请注意“word_1_14”——它的 word.text 是 None(NoneType),因此没有被打印出来……我发现这是因为该单词的文本被包在 <strong> 标签里,所以无法通过 word.text 直接取到。你知道如何找到带有 <strong> 标签的单词并把它们打印出来吗?

这一行有问题——看起来 word_1_14 的 word.text 是 NoneType 对象……因此无法打印出文本。

In the Python code:
      for word in ocr_word:
In the XML file:
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>

在我看来,ET.find无法处理用STRONG标记突出显示的XML文本。

Python代码:

def marktag(xmlObject, draw_img, color, printText, printTag, strongWord=None):
    """Draw the text of an hOCR element on top of the image.

    The text is either the element's own text or, for a word wrapped in a
    ``<strong>`` tag, the text supplied through ``strongWord``.

    Args:
        xmlObject: Element whose ``text`` is drawn when ``strongWord`` is
            not given. It is returned unchanged.
        draw_img: Object exposing ``text(xy, text, font=..., fill=...)``;
            the visible callers pass a PIL ``ImageDraw`` instance.
        color: Fill value forwarded to ``draw_img.text``.
        printText: When falsy, nothing is printed or drawn.
        printTag: Not used by the visible part of this function.
            NOTE(review): presumably consumed by code that was left out of
            the post -- confirm.
        strongWord: Optional bold text, either as a string or as the
            ``<strong>`` element itself (its ``text`` is used).

    Returns:
        ``xmlObject``.

    NOTE(review): ``bbCoord_x0``, ``bbCoord_y0`` and ``fnt`` are not defined
    in the visible code; they must be provided by the surrounding module.
    """
    if printText:
        if strongWord is not None:
            # Compare against None explicitly: an Element without children
            # is falsy, so ``if strongWord:`` would skip a valid <strong>.
            if isinstance(strongWord, str):
                textInTag = strongWord
            else:
                textInTag = strongWord.text
            label = 'debug strong '
        else:
            textInTag = xmlObject.text
            label = 'debug 1:'

        # ``text`` is None when the element holds no text of its own, and
        # may be pure indentation when the hOCR file is pretty-printed.
        textInTag = (textInTag or '').strip()
        if textInTag:
            # debug
            print(label + textInTag)
            # Draw on the object that was passed in, not on a global.
            draw_img.text((bbCoord_x0, bbCoord_y0), textInTag,
                          font=fnt, fill=color)

    return xmlObject

    # processing the image and show it
# NOTE: in the original paste the chdir below was indented under the
# ``return`` of marktag(), which made it unreachable; it belongs to the
# top-level script.
os.chdir('/home/DocData/PDF_DOC/')

file = '2001ABI-7.png'
XMLfilename = file+'.hocr'
tree = ET.parse(XMLfilename)  # page image is 2550x3300 pixels
root = tree.getroot()
ocr_carea = root.findall(".//{http://www.w3.org/1999/xhtml}div[@class='ocr_carea']")
# NOTE(review): this directory differs from the one passed to os.chdir()
# above -- confirm which of the two is correct.
img = Image.open('/home/bnpp/DocData/PDF_DOC/'+file)
draw = ImageDraw.Draw(img)

area_color = 255
para_color = 145
line_color = 90
word_color = 40

# Walk the hOCR hierarchy: area -> paragraph -> line -> word.
for area in ocr_carea:
    marktag(area, draw, area_color, False, True)

    ocr_para = area.findall(".//{http://www.w3.org/1999/xhtml}p[@class='ocr_par']")
    for para in ocr_para:
        marktag(para, draw, para_color, False, True)
        # some word shown under line
        ocr_line = para.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocr_line']")
        for line in ocr_line:
            marktag(line, draw, line_color, False, True)
            ocr_word = line.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']")
            for word in ocr_word:
                if len(word):
                    # The word has a child element (e.g. <strong>), so its
                    # text lives on that child rather than on word.text.
                    # Pass the child element itself: marktag reads ``.text``
                    # from it. printText must be True or nothing is drawn.
                    strong_word = word[0]
                    marktag(word, draw, word_color, True, True, strong_word)
                else:
                    # Plain word. The original used ``break`` here, which
                    # dropped every remaining word of the line.
                    marktag(word, draw, word_color, True, False)

这是xml:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html
	xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
	<head>
		<title></title>
		<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
		<meta name='ocr-system' content='tesseract 3.03' />
		<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
	</head>
	<body>
		<div class='ocr_page' id='page_1' title='image "2001ABI-7.png"; bbox 0 0 2550 3300; ppageno 0'>
			<div class='ocr_carea' id='block_1_1' title="bbox 762 112 1394 161">
				<p class='ocr_par' dir='ltr' id='par_1_1' title="bbox 762 112 1394 161">
					<span class='ocr_line' id='line_1_1' title="bbox 762 112 1394 161; baseline 0 -1">
						<span class='ocrx_word' id='word_1_1' title='bbox 762 112 1034 161; x_wconf 91' lang='eng' dir='ltr'>STATION</span>
						<span class='ocrx_word' id='word_1_2' title='bbox 1056 112 1394 161; x_wconf 91' lang='eng' dir='ltr'>LOCATION</span>
					</span>
				</p>
			</div>
			<div class='ocr_carea' id='block_1_2' title="bbox 1192 182 1818 318">
				<p class='ocr_par' dir='ltr' id='par_1_2' title="bbox 1203 205 1611 307">
					<span class='ocr_line' id='line_1_2' title="bbox 1373 205 1611 221; baseline 0 -1">
						<span class='ocrx_word' id='word_1_3' title='bbox 1373 205 1507 221; x_wconf 80' lang='eng' dir='ltr'>ELEVATION</span>
						<span class='ocrx_word' id='word_1_4' title='bbox 1536 205 1611 221; x_wconf 80' lang='eng' dir='ltr'>ABOVE</span>
					</span>
					<span class='ocr_line' id='line_1_3' title="bbox 1218 264 1581 281; baseline 0.006 -2">
						<span class='ocrx_word' id='word_1_5' title='bbox 1218 264 1262 280; x_wconf 88' lang='eng' dir='ltr'>SEA</span>
						<span class='ocrx_word' id='word_1_6' title='bbox 1493 265 1581 281; x_wconf 85' lang='eng' dir='ltr'>GROUND</span>
					</span>
					<span class='ocr_line' id='line_1_4' title="bbox 1203 292 1276 307; baseline 0 0">
						<span class='ocrx_word' id='word_1_7' title='bbox 1203 292 1276 307; x_wconf 90' lang='eng' dir='ltr'>LEVEL</span>
					</span>
				</p>
			</div>
			<div class='ocr_carea' id='block_1_3' title="bbox 131 211 1057 1378">
				<p class='ocr_par' dir='ltr' id='par_1_3' title="bbox 131 211 1057 1378">
					<span class='ocr_line' id='line_1_5' title="bbox 1012 211 1028 229; baseline 0 0">
						<span class='ocrx_word' id='word_1_8' title='bbox 1012 211 1028 229; x_wconf 92' lang='eng' dir='ltr'>L</span>
					</span>
					<span class='ocr_line' id='line_1_6' title="bbox 1011 236 1027 254; baseline 0 0">
						<span class='ocrx_word' id='word_1_9' title='bbox 1011 236 1027 254; x_wconf 88' lang='eng' dir='ltr'>A</span>
					</span>
					<span class='ocr_line' id='line_1_7' title="bbox 1013 261 1027 279; baseline 0 0">
						<span class='ocrx_word' id='word_1_10' title='bbox 1013 261 1027 279; x_wconf 97' lang='eng' dir='ltr'>
							<strong>T</strong>
						</span>
					</span>
					<span class='ocr_line' id='line_1_8' title="bbox 1012 286 1020 304; baseline 0 0">
						<span class='ocrx_word' id='word_1_11' title='bbox 1012 286 1020 304; x_wconf 97' lang='eng' dir='ltr'>
							<strong>I</strong>
						</span>
					</span>
					<span class='ocr_line' id='line_1_9' title="bbox 1013 311 1027 329; baseline 0 0">
						<span class='ocrx_word' id='word_1_12' title='bbox 1013 311 1027 329; x_wconf 97' lang='eng' dir='ltr'>T</span>
					</span>
					<span class='ocr_line' id='line_1_10' title="bbox 1012 335 1027 354; baseline 0 0">
						<span class='ocrx_word' id='word_1_13' title='bbox 1012 335 1027 354; x_wconf 92' lang='eng' dir='ltr'>U</span>
					</span>
					<span class='ocr_line' id='line_1_11' title="bbox 621 360 1030 387; baseline 0.002 -7">
						<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
							<strong>I</strong>
						</span>
						<span class='ocrx_word' id='word_1_15' title='bbox 761 383 764 387; x_wconf 50' lang='eng' dir='ltr'>
							<strong>I</strong>
						</span>
						<span class='ocrx_word' id='word_1_16' title='bbox 849 362 922 381; x_wconf 68' lang='eng' dir='ltr'>Afifine</span>
						<span class='ocrx_word' id='word_1_17' title='bbox 1012 360 1030 378; x_wconf 88' lang='eng' dir='ltr'>D</span>
					</span>
				</p>
			</div>
		</div>
	</body>
</html>

输出:

    bbox 762 112 1394 161
ocr_carea-block_1_1
bbox 762 112 1394 161
ocr_par-par_1_1
bbox 762 112 1394 161; baseline 0 -1
ocr_line-line_1_1
bbox 762 112 1034 161; x_wconf 91
debug 1:STATION
para_word
bbox 762 112 1034 161; x_wconf 91
debug 1:STATION
para_word
bbox 1056 112 1394 161; x_wconf 91
debug 1:LOCATION
bbox 1192 182 1818 318
ocr_carea-block_1_2
bbox 1203 205 1611 307
ocr_par-par_1_2
bbox 1373 205 1611 221; baseline 0 -1
ocr_line-line_1_2
bbox 1373 205 1507 221; x_wconf 80
debug 1:ELEVATION
bbox 1218 264 1581 281; baseline 0.006 -2
ocr_line-line_1_3
bbox 1218 264 1262 280; x_wconf 88
debug 1:SEA
bbox 1203 292 1276 307; baseline 0 0
ocr_line-line_1_4
bbox 1203 292 1276 307; x_wconf 90
debug 1:LEVEL
para_word
bbox 1373 205 1507 221; x_wconf 80
debug 1:ELEVATION
para_word
bbox 1536 205 1611 221; x_wconf 80
debug 1:ABOVE
para_word
bbox 1218 264 1262 280; x_wconf 88
debug 1:SEA
para_word
bbox 1493 265 1581 281; x_wconf 85
debug 1:GROUND
para_word
bbox 1203 292 1276 307; x_wconf 90
debug 1:LEVEL
bbox 131 211 1057 1378
ocr_carea-block_1_3
bbox 131 211 1057 1378
ocr_par-par_1_3
bbox 1012 211 1028 229; baseline 0 0
ocr_line-line_1_5
bbox 1012 211 1028 229; x_wconf 92
debug 1:L
bbox 1011 236 1027 254; baseline 0 0
ocr_line-line_1_6
bbox 1011 236 1027 254; x_wconf 88
debug 1:A
bbox 1013 261 1027 279; baseline 0 0
ocr_line-line_1_7
bbox 1013 261 1027 279; x_wconf 97
ocrx_word-word_1_10
bbox 1012 286 1020 304; baseline 0 0
ocr_line-line_1_8
bbox 1012 286 1020 304; x_wconf 97
ocrx_word-word_1_11
bbox 1013 311 1027 329; baseline 0 0
ocr_line-line_1_9
bbox 1013 311 1027 329; x_wconf 97
debug 1:T
bbox 1012 335 1027 354; baseline 0 0
ocr_line-line_1_10
bbox 1012 335 1027 354; x_wconf 92
debug 1:U
bbox 621 360 1030 387; baseline 0.002 -7
ocr_line-line_1_11
bbox 621 383 624 387; x_wconf 50
ocrx_word-word_1_14
bbox 761 383 764 387; x_wconf 50
ocrx_word-word_1_15
bbox 849 362 922 381; x_wconf 68
debug 1:Afifine

1 个答案:

答案 0 :(得分:0)

您可以用 getchildren() 取得子元素,然后从这些子元素中获得 text(注意:Element.getchildren() 在 Python 3.9 中已被移除,在新版本中请直接迭代元素,例如 for x in word)

for word in ocr_word:

    # Text directly inside the word element. ``text`` is None when the
    # element starts with a child tag, so fall back to an empty string.
    text_main = [(word.text or "").strip()]

    # Text of each direct child (e.g. <strong>). Iterate the element
    # itself: Element.getchildren() was removed in Python 3.9.
    text_children = [(child.text or "").strip() for child in word]

    # Join the non-empty pieces into one string.
    text = " ".join(part for part in text_main + text_children if part)

    # result
    print(word.get('id'), text)

最小的工作示例

from xml.etree import ElementTree as ET

# Sample hOCR page produced by tesseract; some words are wrapped in <strong>.
data = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html
xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta name='ocr-system' content='tesseract 3.03' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "2001ABI-7.png"; bbox 0 0 2550 3300; ppageno 0'>
<div class='ocr_carea' id='block_1_1' title="bbox 762 112 1394 161">
<p class='ocr_par' dir='ltr' id='par_1_1' title="bbox 762 112 1394 161">
<span class='ocr_line' id='line_1_1' title="bbox 762 112 1394 161; baseline 0 -1">
<span class='ocrx_word' id='word_1_1' title='bbox 762 112 1034 161; x_wconf 91' lang='eng' dir='ltr'>STATION</span>
<span class='ocrx_word' id='word_1_2' title='bbox 1056 112 1394 161; x_wconf 91' lang='eng' dir='ltr'>LOCATION</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 1192 182 1818 318">
<p class='ocr_par' dir='ltr' id='par_1_2' title="bbox 1203 205 1611 307">
<span class='ocr_line' id='line_1_2' title="bbox 1373 205 1611 221; baseline 0 -1">
<span class='ocrx_word' id='word_1_3' title='bbox 1373 205 1507 221; x_wconf 80' lang='eng' dir='ltr'>ELEVATION</span>
<span class='ocrx_word' id='word_1_4' title='bbox 1536 205 1611 221; x_wconf 80' lang='eng' dir='ltr'>ABOVE</span>
</span>
<span class='ocr_line' id='line_1_3' title="bbox 1218 264 1581 281; baseline 0.006 -2">
<span class='ocrx_word' id='word_1_5' title='bbox 1218 264 1262 280; x_wconf 88' lang='eng' dir='ltr'>SEA</span>
<span class='ocrx_word' id='word_1_6' title='bbox 1493 265 1581 281; x_wconf 85' lang='eng' dir='ltr'>GROUND</span>
</span>
<span class='ocr_line' id='line_1_4' title="bbox 1203 292 1276 307; baseline 0 0">
<span class='ocrx_word' id='word_1_7' title='bbox 1203 292 1276 307; x_wconf 90' lang='eng' dir='ltr'>LEVEL</span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_3' title="bbox 131 211 1057 1378">
<p class='ocr_par' dir='ltr' id='par_1_3' title="bbox 131 211 1057 1378">
<span class='ocr_line' id='line_1_5' title="bbox 1012 211 1028 229; baseline 0 0">
<span class='ocrx_word' id='word_1_8' title='bbox 1012 211 1028 229; x_wconf 92' lang='eng' dir='ltr'>L</span>
</span>
<span class='ocr_line' id='line_1_6' title="bbox 1011 236 1027 254; baseline 0 0">
<span class='ocrx_word' id='word_1_9' title='bbox 1011 236 1027 254; x_wconf 88' lang='eng' dir='ltr'>A</span>
</span>
<span class='ocr_line' id='line_1_7' title="bbox 1013 261 1027 279; baseline 0 0">
<span class='ocrx_word' id='word_1_10' title='bbox 1013 261 1027 279; x_wconf 97' lang='eng' dir='ltr'>
<strong>T</strong>
</span>
</span>
<span class='ocr_line' id='line_1_8' title="bbox 1012 286 1020 304; baseline 0 0">
<span class='ocrx_word' id='word_1_11' title='bbox 1012 286 1020 304; x_wconf 97' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
</span>
<span class='ocr_line' id='line_1_9' title="bbox 1013 311 1027 329; baseline 0 0">
<span class='ocrx_word' id='word_1_12' title='bbox 1013 311 1027 329; x_wconf 97' lang='eng' dir='ltr'>T</span>
</span>
<span class='ocr_line' id='line_1_10' title="bbox 1012 335 1027 354; baseline 0 0">
<span class='ocrx_word' id='word_1_13' title='bbox 1012 335 1027 354; x_wconf 92' lang='eng' dir='ltr'>U</span>
</span>
<span class='ocr_line' id='line_1_11' title="bbox 621 360 1030 387; baseline 0.002 -7">
<span class='ocrx_word' id='word_1_14' title='bbox 621 383 624 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_15' title='bbox 761 383 764 387; x_wconf 50' lang='eng' dir='ltr'>
<strong>I</strong>
</span>
<span class='ocrx_word' id='word_1_16' title='bbox 849 362 922 381; x_wconf 68' lang='eng' dir='ltr'>Afifine</span>
<span class='ocrx_word' id='word_1_17' title='bbox 1012 360 1030 378; x_wconf 88' lang='eng' dir='ltr'>D</span>
</span>
</p>
</div>
</div>
</body>
</html>'''


def word_text(word):
    """Return the text of *word* plus the text of its direct children.

    Each piece is stripped and the non-empty pieces are joined with single
    spaces. A missing ``text`` (None) on the element or on a child counts
    as empty. Tail text after a child is not included.
    """
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9.
    pieces = [word.text] + [child.text for child in word]
    cleaned = ((piece or "").strip() for piece in pieces)
    return " ".join(piece for piece in cleaned if piece)


# ET.fromstring() already returns the root element, so no getroot() call.
tree = ET.fromstring(data)

line = tree

ocr_word = line.findall(".//{http://www.w3.org/1999/xhtml}span[@class='ocrx_word']")

for word in ocr_word:
    text = word_text(word)
    print(word.get('id'), text)