我发现下面的文章要用python做。
https://docs.aws.amazon.com/textract/latest/dg/examples-export-table-csv.html
我也用下面的文章提取文本。
https://docs.aws.amazon.com/textract/latest/dg/detecting-document-text.html
但是上面的文章仅帮助获取文本,我还使用了函数“ block.getBlockType()” 的块,但即使表在image / pdf中,也没有块返回其类型为“ CELL”。
帮我找到类似于“ boto3”的Java库来提取所有表。
答案 0 :(得分:3)
我所做的是,我在json响应中创建了每个数据集的模型,并且可以使用此模型在jsf中构建表视图。
public static List<TableModel> getTablesFromTextract(TextractModel textractModel) {
List<TableModel> tables = null;
try {
if (textractModel != null) {
tables = new ArrayList<>();
List<BlockModel> tableBlocks = new ArrayList<>();
Map<String, BlockModel> blockMap = new HashMap<>();
for (BlockModel block : textractModel.getBlocks()) {
if (block.getBlockType().equals("TABLE")) {
tableBlocks.add(block);
}
blockMap.put(block.getId(), block);
}
for (BlockModel blockModel : tableBlocks) {
Map<Long, Map<Long, String>> rowMap = new HashMap<>();
for (RelationshipModel relationship : blockModel.getRelationships()) {
if (relationship.getType().equals("CHILD")) {
for (String id : relationship.getIds()) {
BlockModel cell = blockMap.get(id);
if (cell.getBlockType().equals("CELL")) {
long rowIndex = cell.getRowIndex();
long columnIndex = cell.getColumnIndex();
if (!rowMap.containsKey(rowIndex)) {
rowMap.put(rowIndex, new HashMap<>());
}
Map<Long, String> columnMap = rowMap.get(rowIndex);
columnMap.put(columnIndex, getCellText(cell, blockMap));
}
}
}
}
tables.add(new TableModel(blockModel, rowMap));
}
System.out.println("row Map " + tables.toString());
}
} catch (Exception e) {
LOG.error("Could not get table from textract model", e);
}
return tables;
}
private static String getCellText(BlockModel cell, Map<String, BlockModel> blockMap) {
String text = "";
try {
if (cell != null
&& CollectionUtils.isNotEmpty(cell.getRelationships())) {
for (RelationshipModel relationship : cell.getRelationships()) {
if (relationship.getType().equals("CHILD")) {
for (String id : relationship.getIds()) {
BlockModel word = blockMap.get(id);
if (word.getBlockType().equals("WORD")) {
text += word.getText() + " ";
} else if (word.getBlockType().equals("SELECTION_ELEMENT")) {
if (word.getSelectionStatus().equals("SELECTED")) {
text += "X ";
}
}
}
}
}
}
} catch (Exception e) {
LOG.error("Could not get cell text of table", e);
}
return text;
}
TableModel用于从以下位置创建视图:
public class TableModel {
private BlockModel table;
private Map<Long, Map<Long, String>> rowMap;
public TableModel(BlockModel table, Map<Long, Map<Long, String>> rowMap) {
this.table = table;
this.rowMap = rowMap;
}
public BlockModel getTable() {
return table;
}
public void setTable(BlockModel table) {
this.table = table;
}
public Map<Long, Map<Long, String>> getRowMap() {
return rowMap;
}
public void setRowMap(Map<Long, Map<Long, String>> rowMap) {
this.rowMap = rowMap;
}
@Override
public String toString() {
return table.getId() + " - " + rowMap.toString();
}
答案 1 :(得分:0)
我有类似的东西:
public class AnalyzeDocument {
public DocumentModel startProcess(byte[] content) {
Region region = Region.EU_WEST_2;
TextractClient textractClient = TextractClient.builder().region(region)
.credentialsProvider(EnvironmentVariableCredentialsProvider.create()).build();
return analyzeDoc(textractClient, content);
}
public DocumentModel analyzeDoc(TextractClient textractClient, byte[] content) {
try {
SdkBytes sourceBytes = SdkBytes.fromByteArray(content);
Util util = new Util();
Document myDoc = Document.builder().bytes(sourceBytes).build();
List<FeatureType> featureTypes = new ArrayList<FeatureType>();
featureTypes.add(FeatureType.FORMS);
featureTypes.add(FeatureType.TABLES);
AnalyzeDocumentRequest analyzeDocumentRequest = AnalyzeDocumentRequest.builder().featureTypes(featureTypes)
.document(myDoc).build();
AnalyzeDocumentResponse analyzeDocument = textractClient.analyzeDocument(analyzeDocumentRequest);
List<Block> docInfo = analyzeDocument.blocks();
// util.displayBlockInfo(docInfo);
PageModel pageModel = util.getTableResults(docInfo);
DocumentModel documentModel = new DocumentModel();
documentModel.getPages().add(pageModel);
Iterator<Block> blockIterator = docInfo.iterator();
while (blockIterator.hasNext()) {
Block block = blockIterator.next();
log.debug("The block type is " + block.blockType().toString());
}
return documentModel;
} catch (TextractException e) {
System.err.println(e.getMessage());
}
return null;
}
这是util文件:
public PageModel getTableResults(List<Block> blocks) {
List<Block> tableBlocks = new ArrayList<>();
Map<String, Block> blockMap = new HashMap<>();
for (Block block : blocks) {
blockMap.put(block.id(), block);
if (block.blockType().equals(BlockType.TABLE)) {
tableBlocks.add(block);
log.debug("added table: " + block.text());
}
}
PageModel page = new PageModel();
if (tableBlocks.size() == 0) {
return null;
}
int i = 0;
for (Block table : tableBlocks) {
page.getTables().add(generateTable(table, blockMap, i++));
}
return page;
}
private TableModel generateTable(Block table, Map<String, Block> blockMap, int index) {
TableModel model = new TableModel();
Map<Integer, Map<Integer, String>> rows = getRowsColumnsMap(table, blockMap);
model.setTableId("Table_" + index);
for (Map.Entry<Integer, Map<Integer, String>> entry : rows.entrySet()) {
RowModel rowModel = new RowModel();
Map<Integer, String> value = entry.getValue();
for (int i = 0; i < value.size(); i++) {
rowModel.getCells().add(value.get(i));
}
model.getRows().add(rowModel);
}
return model;
}
private Map<Integer, Map<Integer, String>> getRowsColumnsMap(Block block, Map<String, Block> blockMap) {
Map<Integer, Map<Integer, String>> rows = new HashMap<>();
for (Relationship relationship : block.relationships()) {
if (relationship.type().equals(RelationshipType.CHILD)) {
for (String childId : relationship.ids()) {
Block cell = blockMap.get(childId);
if (cell != null) {
int rowIndex = cell.rowIndex();
int colIndex = cell.columnIndex();
if (rows.get(rowIndex) == null) {
Map<Integer, String> row = new HashMap<>();
rows.put(rowIndex, row);
}
rows.get(rowIndex).put(colIndex, getText(cell, blockMap));
}
}
}
}
return rows;
}
public String getText(Block block, Map<String, Block> blockMap) {
String text = "";
if (block.relationships() != null && block.relationships().size() > 0) {
for (Relationship relationship : block.relationships()) {
if (relationship.type().equals(RelationshipType.CHILD)) {
for (String childId : relationship.ids()) {
Block wordBlock = blockMap.get(childId);
if (wordBlock != null && wordBlock.blockType() != null) {
if (wordBlock.blockType().equals(BlockType.WORD))) {
text += wordBlock.text() + " ";
}
}
}
}
}
}
return text;
}