Am使用摄取附件插件处理器通过Java代码为pdf文档编制索引。创建索引pdf文档时我正在听取者超时。
我有将近150 GB的文档要建立索引。并且文件路径在oracle数据库中可用。
所以,首先,我使用logstash为我的oracle数据库表建立了索引,以获取文件路径(本地计算机驱动器)。然后使用Java代码对实际文件进行索引。
请在下面找到我的代码。
public class DocumentIndex {
private final static String INDEX = "documents_local"; //Source index
private final static String ATTACHMENT = "document_attachment"; //Destination index
private final static String TYPE = "doc";
private static final Logger logger = Logger.getLogger(Thread.currentThread().getStackTrace()[0].getClassName());
public static void main(String args[]) throws IOException {
RestHighLevelClient restHighLevelClient = null;
Document doc=new Document();
logger.info("Started Indexing the Document.....");
try {
restHighLevelClient = new RestHighLevelClient(RestClient.builder(new HttpHost("localhost", 9200, "http"),
new HttpHost("localhost", 9201, "http")));
} catch (Exception e) {
System.out.println(e.getMessage());
}
//Fetching Id, FilePath & FileName from Document Index.
SearchRequest searchRequest = new SearchRequest(INDEX);
searchRequest.types(TYPE);
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
QueryBuilder qb = QueryBuilders.matchAllQuery();
searchSourceBuilder.query(qb);
searchSourceBuilder.size(3000);
searchRequest.source(searchSourceBuilder);
SearchResponse searchResponse = null;
try {
searchResponse = restHighLevelClient.search(searchRequest);
} catch (IOException e) {
e.getLocalizedMessage();
}
SearchHit[] searchHits = searchResponse.getHits().getHits();
long totalHits=searchResponse.getHits().totalHits;
logger.info("Total Hits --->"+totalHits);
File all_files_path = new File("d:\\All_Files_Path.txt");
File available_files = new File("d:\\Available_Files.txt");
File missing_files = new File("d:\\Missing_Files.txt");
all_files_path.deleteOnExit();
available_files.deleteOnExit();
missing_files.deleteOnExit();
all_files_path.createNewFile();
available_files.createNewFile();
missing_files.createNewFile();
int totalFilePath=1;
int totalAvailableFile=1;
int missingFilecount=1;
Map<String, Object> jsonMap ;
for (SearchHit hit : searchHits) {
String encodedfile = null;
File file=null;
Map<String, Object> sourceAsMap = hit.getSourceAsMap();
if(sourceAsMap != null) {
doc.setId((int) sourceAsMap.get("id"));
doc.setApp_language(String.valueOf(sourceAsMap.get("app_language")));
}
String filepath=doc.getPath().concat(doc.getFilename());
logger.info("ID---> "+doc.getId()+"File Path --->"+filepath);
try(PrintWriter out = new PrintWriter(new FileOutputStream(all_files_path, true)) ){
out.println("FilePath Count ---"+totalFilePath+":::::::ID---> "+doc.getId()+"File Path --->"+filepath);
}
file = new File(filepath);
if(file.exists() && !file.isDirectory()) {
try {
try(PrintWriter out = new PrintWriter(new FileOutputStream(available_files, true)) ){
out.println("Available File Count --->"+totalAvailableFile+":::::::ID---> "+doc.getId()+"File Path --->"+filepath);
totalAvailableFile++;
}
FileInputStream fileInputStreamReader = new FileInputStream(file);
byte[] bytes = new byte[(int) file.length()];
fileInputStreamReader.read(bytes);
encodedfile = new String(Base64.getEncoder().encodeToString(bytes));
fileInputStreamReader.close();
} catch (FileNotFoundException e) {
e.printStackTrace();
}
}
else
{
PrintWriter out = new PrintWriter(new FileOutputStream(missing_files, true));
out.println("Available File Count --->"+missingFilecount+":::::::ID---> "+doc.getId()+"File Path --->"+filepath);
missingFilecount++;
}
jsonMap = new HashMap<>();
jsonMap.put("id", doc.getId());
jsonMap.put("app_language", doc.getApp_language());
jsonMap.put("fileContent", encodedfile);
String id=Long.toString(doc.getId());
IndexRequest request = new IndexRequest(ATTACHMENT, "doc", id )
.source(jsonMap)
.setPipeline(ATTACHMENT);
PrintStream printStream = new PrintStream(new File("d:\\exception.txt"));
try {
IndexResponse response = restHighLevelClient.index(request);
} catch(ElasticsearchException e) {
if (e.status() == RestStatus.CONFLICT) {
}
e.printStackTrace(printStream);
}
totalFilePath++;
}
logger.info("Indexing done.....");
}
}
请找到我的映射文件
PUT _ingest/pipeline/document_attachment
{
"description" : "my first pipeline with handled exceptions",
"processors" : [
{
"attachment" : {
"field" : "fileContent",
"on_failure" : [
{
"set" : {
"field" : "error",
"value" : "{{ _ingest.on_failure_message }}"
}
}
]
}
}
]
}
PUT document_attachment
{
"settings": {
"analysis": {
"analyzer": {
"custom_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"char_filter": [
"html_strip"
],
"filter": [
"lowercase",
"asciifolding"
]
},
"product_catalog_keywords_analyzer": {
"type": "custom",
"tokenizer": "whitespace",
"char_filter": [
"html_strip"
],
"filter": [
"lowercase",
"asciifolding"
]
}
}
}
},
"mappings" : {
"doc" : {
"properties" : {
"attachment" : {
"properties" : {
"content" : {
"type" : "text",
"analyzer": "custom_analyzer"
},
"content_length" : {
"type" : "long"
},
"content_type" : {
"type" : "text"
},
"language" : {
"type" : "text"
}
}
},
"fileContent" : {
"type" : "text"
},
"id": {
"type": "long"
},
"app_language" : {
"type" : "text"
}
}
}
}
}
请查找将我的代码作为可执行jar文件执行时遇到的错误。
Exception in thread "main" java.io.IOException: listener timeout after waiting f
or [30000] ms
at org.elasticsearch.client.RestClient$SyncResponseListener.get(RestClie
nt.java:663)
at org.elasticsearch.client.RestClient.performRequest(RestClient.java:22
2)
at org.elasticsearch.client.RestClient.performRequest(RestClient.java:19
4)
at org.elasticsearch.client.RestHighLevelClient.performRequest(RestHighL
evelClient.java:443)
at org.elasticsearch.client.RestHighLevelClient.performRequestAndParseEn
tity(RestHighLevelClient.java:429)
at org.elasticsearch.client.RestHighLevelClient.index(RestHighLevelClien
t.java:312)
at com.es.utility.DocumentIndex.main(DocumentIndex.java:222)