Question

Am使用摄取附件插件处理器通过Java代码为pdf文档编制索引。创建索引pdf文档时我正在听取者超时。

我有将近150 GB的文档要建立索引。并且文件路径在oracle数据库中可用。

所以，首先，我使用logstash为我的oracle数据库表建立了索引，以获取文件路径（本地计算机驱动器）。然后使用Java代码对实际文件进行索引。

请在下面找到我的代码。

public class DocumentIndex {

    private final static String INDEX = "documents_local";  //Source index
    private final static String ATTACHMENT = "document_attachment"; //Destination index
    private final static String TYPE = "doc";
    private static final Logger logger = Logger.getLogger(Thread.currentThread().getStackTrace()[0].getClassName());

    public static void main(String args[]) throws IOException {


        RestHighLevelClient restHighLevelClient = null;
        Document doc=new Document();

        logger.info("Started Indexing the Document.....");

        try {
            restHighLevelClient = new RestHighLevelClient(RestClient.builder(new HttpHost("localhost", 9200, "http"),
                    new HttpHost("localhost", 9201, "http")));
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }


        //Fetching Id, FilePath & FileName from Document Index. 
        SearchRequest searchRequest = new SearchRequest(INDEX); 
        searchRequest.types(TYPE);
        SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
        QueryBuilder qb = QueryBuilders.matchAllQuery();
        searchSourceBuilder.query(qb);
        searchSourceBuilder.size(3000);
        searchRequest.source(searchSourceBuilder);
        SearchResponse searchResponse = null;
        try {
             searchResponse = restHighLevelClient.search(searchRequest);
        } catch (IOException e) {
            e.getLocalizedMessage();
        }

        SearchHit[] searchHits = searchResponse.getHits().getHits();
        long totalHits=searchResponse.getHits().totalHits;
        logger.info("Total Hits --->"+totalHits);


        File all_files_path = new File("d:\\All_Files_Path.txt");
        File available_files = new File("d:\\Available_Files.txt");
        File missing_files = new File("d:\\Missing_Files.txt");
        all_files_path.deleteOnExit();
        available_files.deleteOnExit();
        missing_files.deleteOnExit();
        all_files_path.createNewFile();
        available_files.createNewFile();
        missing_files.createNewFile();

        int totalFilePath=1;
        int totalAvailableFile=1;
        int missingFilecount=1;

        Map<String, Object> jsonMap ;
        for (SearchHit hit : searchHits) {

            String encodedfile = null;
            File file=null;

            Map<String, Object> sourceAsMap = hit.getSourceAsMap();


            if(sourceAsMap != null) {  
                doc.setId((int) sourceAsMap.get("id"));
                doc.setApp_language(String.valueOf(sourceAsMap.get("app_language")));
             }

            String filepath=doc.getPath().concat(doc.getFilename());

            logger.info("ID---> "+doc.getId()+"File Path --->"+filepath);


            try(PrintWriter out = new PrintWriter(new FileOutputStream(all_files_path, true))  ){
                out.println("FilePath Count ---"+totalFilePath+":::::::ID---> "+doc.getId()+"File Path --->"+filepath);
            }

            file = new File(filepath);
            if(file.exists() && !file.isDirectory()) {
                try {
                      try(PrintWriter out = new PrintWriter(new FileOutputStream(available_files, true))  ){
                            out.println("Available File Count --->"+totalAvailableFile+":::::::ID---> "+doc.getId()+"File Path --->"+filepath);
                            totalAvailableFile++;
                        }
                    FileInputStream fileInputStreamReader = new FileInputStream(file);
                    byte[] bytes = new byte[(int) file.length()];
                    fileInputStreamReader.read(bytes);
                    encodedfile = new String(Base64.getEncoder().encodeToString(bytes));
                    fileInputStreamReader.close();
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                }
            }
            else
            {
                PrintWriter out = new PrintWriter(new FileOutputStream(missing_files, true));
                out.println("Available File Count --->"+missingFilecount+":::::::ID---> "+doc.getId()+"File Path --->"+filepath);
                missingFilecount++;
            }

            jsonMap = new HashMap<>();
            jsonMap.put("id", doc.getId());
            jsonMap.put("app_language", doc.getApp_language());
            jsonMap.put("fileContent", encodedfile);

            String id=Long.toString(doc.getId());

            IndexRequest request = new IndexRequest(ATTACHMENT, "doc", id )
                    .source(jsonMap)
                    .setPipeline(ATTACHMENT);

            PrintStream printStream = new PrintStream(new File("d:\\exception.txt"));
            try {
                IndexResponse response = restHighLevelClient.index(request);

            } catch(ElasticsearchException e) {
                if (e.status() == RestStatus.CONFLICT) {
                }
                e.printStackTrace(printStream);
            }

            totalFilePath++;


        }

        logger.info("Indexing done.....");

    }

}

请找到我的映射文件

PUT _ingest/pipeline/document_attachment
    {
  "description" : "my first pipeline with handled exceptions",
  "processors" : [
    {
      "attachment" : {
        "field" : "fileContent",
        "on_failure" : [
          {
            "set" : {
              "field" : "error",
              "value" : "{{ _ingest.on_failure_message }}"
            }
          }
        ]
      }
    }
  ]
}


PUT document_attachment
{

     "settings": {
     "analysis": {
      "analyzer": {
        "custom_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "char_filter": [
            "html_strip"
          ],
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        },
        "product_catalog_keywords_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "char_filter": [
            "html_strip"
          ],
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  },

  "mappings" : {
    "doc" : {
      "properties" : {
        "attachment" : {
          "properties" : {
            "content" : {
              "type" : "text",
              "analyzer": "custom_analyzer"
            },
            "content_length" : {
              "type" : "long"
            },
            "content_type" : {
              "type" : "text"
            },
            "language" : {
              "type" : "text"
            }
          }
        },
        "fileContent" : {
          "type" : "text"
        },
        "id": {
        "type": "long"
        },
        "app_language" : {
        "type" : "text"
        }
      }
    }
  }
}

请查找将我的代码作为可执行jar文件执行时遇到的错误。

Exception in thread "main" java.io.IOException: listener timeout after waiting f
or [30000] ms
        at org.elasticsearch.client.RestClient$SyncResponseListener.get(RestClie
nt.java:663)
        at org.elasticsearch.client.RestClient.performRequest(RestClient.java:22
2)
        at org.elasticsearch.client.RestClient.performRequest(RestClient.java:19
4)
        at org.elasticsearch.client.RestHighLevelClient.performRequest(RestHighL
evelClient.java:443)
        at org.elasticsearch.client.RestHighLevelClient.performRequestAndParseEn
tity(RestHighLevelClient.java:429)
        at org.elasticsearch.client.RestHighLevelClient.index(RestHighLevelClien
t.java:312)
        at com.es.utility.DocumentIndex.main(DocumentIndex.java:222)

ElasticSearch抛出“ java.io.IOException：等待f或[30000] ms后侦听器超时”

0 个答案: