来自GoLang中的GetDocumentAnalysis的AWS Textract键值对

时间:2019-04-16 19:34:31

标签: amazon-web-services pdf go machine-learning keyvaluepair

我正在发送给Textract的PDF文档上调用AWS Textract GetDocumentAnalysisRequest,该文档返回数据块。然后,我试图将数据转换为键值映射。分析了该文档的表格和表格。我试图将python中提供的示例代码移植到go中,以将块转换为KV Maps(https://github.com/awsdocs/aws-doc-sdk-examples/blob/master/python/example_code/textract/textract-python-kv-parser.py)。在python中工作正常。在我的代码中,键和值中有随机空格。我假设是由于代码text = fmt.Sprintf("%s %s", text, *word.Text)中的这一行。

// Contains tells whether a contains x.
func Contains(a []*string, x string) bool {
    for _, n := range a {
        if x == *n {
            return true
        }
    }
    return false
}

type KeyValueBlock struct {
    KeyMap   map[string]textract.Block
    ValueMap map[string]textract.Block
    BlockMap map[string]textract.Block
}

func getKeyValueMap(blocks []*textract.Block) KeyValueBlock {
    keyMap := map[string]textract.Block{}
    valueMap := map[string]textract.Block{}
    blockMap := map[string]textract.Block{}

    for _, block := range blocks {
        blockID := block.Id
        blockMap[*blockID] = *block

        if *block.BlockType == textract.BlockTypeKeyValueSet {

            if Contains(block.EntityTypes, textract.EntityTypeKey) {
                keyMap[*blockID] = *block
            } else {
                valueMap[*blockID] = *block
            }
        }
    }

    return KeyValueBlock{
        KeyMap:   keyMap,
        ValueMap: valueMap,
        BlockMap: blockMap,
    }
}

func findValueBlock(keyBlock textract.Block, valueMap map[string]textract.Block) textract.Block {
    var valueBlock textract.Block
    for _, relationship := range keyBlock.Relationships {
        if *relationship.Type == textract.EntityTypeValue {
            for _, valueID := range relationship.Ids {
                valueBlock = valueMap[*valueID]
            }
        }
    }
    return valueBlock
}

func getText(result textract.Block, blocksMap map[string]textract.Block) string {
    var text string

    for _, relationship := range result.Relationships {
        if *relationship.Type == textract.RelationshipTypeChild {
            for _, childID := range relationship.Ids {
                word := blocksMap[*childID]
                if *word.BlockType == textract.BlockTypeWord {
                    text = fmt.Sprintf("%s %s", text, *word.Text)
                }
            }
        }
    }

    return text
}

func getKeyValueRelationship(keyValueBlock KeyValueBlock) {
    keyValueMap := map[string]string{}
    for _, keyBlock := range keyValueBlock.KeyMap {
        valueBlock := findValueBlock(keyBlock, keyValueBlock.ValueMap)
        key := getText(keyBlock, keyValueBlock.BlockMap)
        val := getText(valueBlock, keyValueBlock.BlockMap)
        keyValueMap[key] = val
    }
    log.Info(keyValueMap)
}

// Do takes a GetDocumentAnalysisInput and sends to textract
func Do(docAnalysisInput *textract.GetDocumentAnalysisInput, svc textractiface.TextractAPI) error {
    // Sending a request using the StartDocumentAnalysisRequest method.
    req, resp := svc.GetDocumentAnalysisRequest(docAnalysisInput)

    err := req.Send()
    if err == nil { // resp is now filled
        log.Infof("Job completeted with status: %s", *resp.JobStatus)
    }
    data := getKeyValueMap(resp.Blocks)
    getKeyValueRelationship(data)
    return err
}

获取: {“ level”:“ info”,“ msg”:“ map [+:测试Key1(提供的总金额):6002.00圣诞老人键名称:: Some Text Some Key3:0.00 Some Charge:$ 5000.00 Som e Key6:5.88 %一些句子键:值单词一些长键:一些Key2:#552242]“,” time“:” 2019-04-16T19:06:18Z“}

预期 {“ level”:“ info”,“ msg”:“ map [测试Key1(提供的总金额):6002.00圣诞老人键名称:Some Text Some Key3:0.00 Some Charge:$ 5000.00 Some Key6:5.88%Some句子关键字:值字Some Long Key:Some Key2:#552242]“,” time“:” 2019-04-16T19:06:18Z“}

0 个答案:

没有答案