我正在使用表单识别器(Form Recognizer)从示例PDF表单(ACORD 3101(2012/02))中提取文本信息，但某些字段返回的“boundingBox”在我看来并不正确。我想知道这背后的原因是什么。
我使用多份已填写的表单和一份空白表单对服务进行了训练。当我在训练好的模型上调用“/{id}/keys”时，确实看到了被识别出的键：
{
"clusters": {
"0": ["ADDITIONAL REMARKS", "ADDITIONAL REMARKS SCHEDULE", "Effective Date:", "Form Number:", "Form Title:", "Insured", "Insurer", "Intermediary", "Page", "Policy Number", "This Additional Remarks form is a schedule to ACORD form,", "__Tokens__"]
}
}
这在我看来没有问题。然后，我调用了“/{id}/analyze” API来分析示例PDF。正如我所说，结果似乎不正确。以下是我得到的JSON响应的一部分。
{
"status": "success",
"pages": [
{
"number": 1,
"height": 842,
"width": 595,
"clusterId": 0,
"keyValuePairs": [
{
"key": [
{
"text": "Page",
"boundingBox": [
493.2,
811.6,
514.7,
811.6,
514.7,
801.6,
493.2,
801.6
]
}
],
"value": [
{
"text": "of",
"boundingBox": [
543.6,
811.6,
552.1,
811.6,
552.1,
801.6,
543.6,
801.6
],
"confidence": 1.0
}
]
},
{
"key": [
{
"text": "__Tokens__",
"boundingBox": [
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0,
0.0
]
}
],
"value": [
{
"text": "1",
"boundingBox": [
62.3,
97.3,
62.8,
97.3,
62.8,
96.2,
62.3,
96.2
],
"confidence": 0.24
},
{
"text": "1",
"boundingBox": [
66.6,
97.3,
67.1,
97.3,
67.1,
96.2,
66.6,
96.2
],
"confidence": 0.24
},
{
"text": "John Doe",
"boundingBox": [
2.8,
93.9,
6.9,
93.9,
6.9,
92.8,
2.8,
92.8
],
"confidence": 0.24
},
{
"text": "Taren Liu",
"boundingBox": [
36.4,
93.8,
40.4,
93.8,
40.4,
92.8,
36.4,
92.8
],
"confidence": 0.24
},
{
"text": "23456R02",
"boundingBox": [
2.8,
90.8,
7.2,
90.8,
7.2,
89.8,
2.8,
89.8
],
"confidence": 0.24
},
{
"text": "RBA",
"boundingBox": [
2.8,
87.9,
4.7,
87.9,
4.7,
86.9,
2.8,
86.9
],
"confidence": 0.24
},
{
"text": "11/08/2019",
"boundingBox": [
48.2,
87.9,
53.0,
87.9,
53.0,
86.9,
48.2,
86.9
],
"confidence": 0.24
},
{
"text": "140001",
"boundingBox": [
10.4,
83.3,
13.6,
83.3,
13.6,
82.2,
10.4,
82.2
],
"confidence": 0.24
},
{
"text": "Hello World",
"boundingBox": [
22.6,
83.3,
27.5,
83.3,
27.5,
82.2,
22.6,
82.2
],
"confidence": 0.24
},
{
"text": "This is the second fake form. See",
"boundingBox": [
2.8,
80.9,
17.0,
80.9,
17.0,
79.8,
2.8,
79.8
],
"confidence": 0.24
},
{
"text": "if",
"boundingBox": [
17.3,
80.9,
17.8,
80.9,
17.8,
79.8,
17.3,
79.8
],
"confidence": 0.24
},
{
"text": "the form recognizer can learn from this.",
"boundingBox": [
18.0,
80.9,
34.7,
80.9,
34.7,
79.8,
18.0,
79.8
],
"confidence": 0.24
}
]
}
],
"tables": []
}
],
"errors": []
}
请注意，高度和宽度值(分别为842和595)是正确的，这是标准A4纸张的尺寸(以磅为单位)。但是，“John Doe”和“Taren Liu”等字段的boundingBox信息是错误的。显然，这些boundingBox都集中在页面的左下角(例如，“John Doe”的坐标是2.8、93.9、6.9、93.9、6.9、92.8、2.8、92.8)，而不是PDF顶部的预期位置。为什么？
以下是同时用于训练和分析的示例PDF
答案 0 :(得分:0)
您能否也分享一下训练数据(经过匿名处理、不含真实数据)？
答案 1 :(得分:0)
您是否已验证这些边界框在正确的位置?