我对一个已提交给 Textract 的 PDF 文档调用 AWS Textract 的 GetDocumentAnalysisRequest,它返回了一组数据块(Block)。然后,我尝试把这些数据转换为键值映射。该文档是按表单和表格两种类型进行分析的。我正在把官方提供的 Python 示例代码移植到 Go,用于将 Block 转换为键值映射(https://github.com/awsdocs/aws-doc-sdk-examples/blob/master/python/example_code/textract/textract-python-kv-parser.py)。该示例在 Python 中工作正常,但在我的 Go 代码中,键和值里出现了多余的空格。我猜测问题出在代码中的这一行:text = fmt.Sprintf("%s %s", text, *word.Text)
// Contains reports whether any element of a points to a string equal to x.
//
// Nil entries are skipped instead of being dereferenced, so a slice that
// holds nil pointers (or is itself nil or empty) can be searched safely.
func Contains(a []*string, x string) bool {
	for _, s := range a {
		if s != nil && *s == x {
			return true
		}
	}
	return false
}
// KeyValueBlock groups the three lookup tables that getKeyValueMap builds from
// a list of Textract blocks. Every table is keyed by block ID.
type KeyValueBlock struct {
	// KeyMap holds the key/value-set blocks whose entity types include
	// textract.EntityTypeKey.
	KeyMap map[string]textract.Block
	// ValueMap holds the remaining key/value-set blocks.
	ValueMap map[string]textract.Block
	// BlockMap holds every block, whatever its type.
	BlockMap map[string]textract.Block
}
// getKeyValueMap indexes blocks by ID and sorts the key/value-set blocks into
// keys and values.
//
// Every block is stored in BlockMap. A block of type
// textract.BlockTypeKeyValueSet is additionally stored in KeyMap when its
// entity types contain textract.EntityTypeKey, and in ValueMap otherwise.
func getKeyValueMap(blocks []*textract.Block) KeyValueBlock {
	result := KeyValueBlock{
		KeyMap:   make(map[string]textract.Block),
		ValueMap: make(map[string]textract.Block),
		BlockMap: make(map[string]textract.Block),
	}
	for _, b := range blocks {
		id := *b.Id
		result.BlockMap[id] = *b

		// Only key/value-set blocks go into the key or value table.
		if *b.BlockType != textract.BlockTypeKeyValueSet {
			continue
		}

		target := result.ValueMap
		if Contains(b.EntityTypes, textract.EntityTypeKey) {
			target = result.KeyMap
		}
		target[id] = *b
	}
	return result
}
// findValueBlock returns the value block that keyBlock refers to.
//
// It walks keyBlock's relationships and, for each relationship of type
// textract.RelationshipTypeValue, looks every referenced ID up in valueMap.
// When several IDs are listed the last one wins. An ID that is absent from
// valueMap, or a key block without such a relationship, yields the zero
// textract.Block.
func findValueBlock(keyBlock textract.Block, valueMap map[string]textract.Block) textract.Block {
	var valueBlock textract.Block
	for _, relationship := range keyBlock.Relationships {
		// Relationship.Type carries a relationship type, so it is compared
		// with the RelationshipType* constant rather than an EntityType* one.
		if *relationship.Type != textract.RelationshipTypeValue {
			continue
		}
		for _, valueID := range relationship.Ids {
			valueBlock = valueMap[*valueID]
		}
	}
	return valueBlock
}
// getText returns the text of the word blocks that are children of result,
// joined by single spaces with no leading space.
//
// For every relationship of type textract.RelationshipTypeChild, each child ID
// is looked up in blocksMap and the Text of blocks whose type is
// textract.BlockTypeWord is appended. Child IDs that are nil or missing from
// blocksMap, and blocks lacking a type or text, are skipped. A block with no
// matching children yields the empty string.
func getText(result textract.Block, blocksMap map[string]textract.Block) string {
	var text string
	for _, relationship := range result.Relationships {
		if relationship.Type == nil || *relationship.Type != textract.RelationshipTypeChild {
			continue
		}
		for _, childID := range relationship.Ids {
			if childID == nil {
				continue
			}
			// A missing ID would give a zero Block whose nil BlockType must
			// not be dereferenced.
			word, ok := blocksMap[*childID]
			if !ok || word.BlockType == nil || word.Text == nil {
				continue
			}
			if *word.BlockType != textract.BlockTypeWord {
				continue
			}
			if text == "" {
				// First word: formatting "%s %s" with an empty prefix is what
				// produced the stray leading space in keys and values.
				text = *word.Text
				continue
			}
			text = fmt.Sprintf("%s %s", text, *word.Text)
		}
	}
	return text
}
// getKeyValueRelationship pairs the text of every key block with the text of
// the value block found for it, then logs the resulting map via log.Info.
func getKeyValueRelationship(keyValueBlock KeyValueBlock) {
	pairs := make(map[string]string)
	for _, kb := range keyValueBlock.KeyMap {
		vb := findValueBlock(kb, keyValueBlock.ValueMap)
		pairs[getText(kb, keyValueBlock.BlockMap)] = getText(vb, keyValueBlock.BlockMap)
	}
	log.Info(pairs)
}
// Do sends docAnalysisInput to Textract through svc and, when the request
// succeeds, converts the returned blocks into key/value pairs and logs them.
//
// It returns the error from sending the request unchanged, or nil on success.
// Nothing is parsed or logged when the request fails.
func Do(docAnalysisInput *textract.GetDocumentAnalysisInput, svc textractiface.TextractAPI) error {
	// Build the request with GetDocumentAnalysisRequest; resp is only filled
	// in once Send has succeeded.
	req, resp := svc.GetDocumentAnalysisRequest(docAnalysisInput)
	if err := req.Send(); err != nil {
		return err
	}
	if resp.JobStatus != nil {
		log.Infof("Job completed with status: %s", *resp.JobStatus)
	}
	// NOTE(review): only the blocks of this single response are processed; if
	// the service paginates results, later pages are not fetched here - confirm.
	data := getKeyValueMap(resp.Blocks)
	getKeyValueRelationship(data)
	return nil
}
实际输出: {"level":"info","msg":"map[+:测试Key1(提供的总金额):6002.00圣诞老人键名称:: Some Text Some Key3:0.00 Some Charge:$ 5000.00 Som e Key6:5.88 %一些句子键:值单词一些长键:一些Key2:#552242]","time":"2019-04-16T19:06:18Z"}
预期输出: {"level":"info","msg":"map[测试Key1(提供的总金额):6002.00圣诞老人键名称:Some Text Some Key3:0.00 Some Charge:$ 5000.00 Some Key6:5.88%Some句子关键字:值字Some Long Key:Some Key2:#552242]","time":"2019-04-16T19:06:18Z"}