在Solr中使用tika解析图像以及pdf中的文本(请求处理程序)

时间:2018-12-12 13:58:59

标签: indexing solr lucene apache-tika

我正在尝试使用solr 6为pdf文件编制索引,并希望提取图像(如果有)并将其保存到某个位置。我正在使用以下配置,但无法提取图像。我已经成功索引了pdf文本内容。

schema.xml

<?xml version="1.0" encoding="UTF-8" ?>
 <schema name="Breast-Cancer_PDFSchema" version="1.6">
 <uniqueKey>id</uniqueKey>
<field name="id" type="strings" multiValued="false" indexed="true" required="true" stored="true"/>
<field name="_version_" type="long" indexed="true" stored="true"/>
<field name="date" type="tdates" indexed="true" stored="true"/>
<field name="pdf_pdfversion" type="strings" indexed="true" stored="true"/>
<field name="stream_content_type" type="strings" indexed="true" stored="true"/>
<field name="access_permission_modify_annotations" type="strings" indexed="true" stored="true"/>    
<field name="access_permission_can_print_degraded" type="strings" indexed="true" stored="true"/>
<field name="dcterms_created" type="strings" indexed="true" stored="true"/>
<field name="last_modified" type="strings" indexed="true" stored="true"/>
<field name="dcterms_modified" type="strings" indexed="true" stored="true"/>
<field name="dc_format" type="strings" indexed="true" stored="true"/>
<field name="last_save_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_fill_in_form" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_modified" type="strings" indexed="true" stored="true"/>
<field name="stream_name" type="strings" indexed="true" stored="true"/>
<field name="meta_save_date" type="strings" indexed="true" stored="true"/>
<field name="pdf_encrypted" type="strings" indexed="true" stored="true"/>
<field name="modified" type="strings" indexed="true" stored="true"/>
<field name="content_type" type="strings" indexed="true" stored="true"/>
<field name="stream_size" type="strings" indexed="true" stored="true"/>
<field name="x_parsed_by" type="strings" indexed="true" stored="true"/>
<field name="meta_creation_date" type="strings" indexed="true" stored="true"/>
<field name="stream_source_info" type="strings" indexed="true" stored="true"/>
<field name="created" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_for_accessibility" type="strings" indexed="true" stored="true"/>
<field name="access_permission_assemble_document" type="strings" indexed="true" stored="true"/>
<field name="xmptpg_npages" type="strings" indexed="true" stored="true"/>
<field name="creation_date" type="strings" indexed="true" stored="true"/>
<field name="access_permission_extract_content" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_print" type="strings" indexed="true" stored="true"/>
<field name="producer" type="strings" indexed="true" stored="true"/>
<field name="subject" type="strings" indexed="true" stored="true"/>
<field name="dc_creator" type="strings" indexed="true" stored="true"/>
<field name="aapl_keywords" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_producer" type="strings" indexed="true" stored="true"/>
<field name="resourcename" type="strings" indexed="true" stored="true"/>
<field name="access_permission_can_modify" type="strings" indexed="true" stored="true"/>
<field name="pdf_docinfo_created" type="strings" indexed="true" stored="true"/>
<field name="_text_" type="strings" indexed="true" stored="true"/>



<fieldType name="string" class="solr.StrField" sortMissingLast="true" docValues="true" />
<fieldType name="strings" class="solr.TextField" sortMissingLast="true" multiValued="true" />
<fieldType name="long" class="solr.TrieLongField" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="booleans" class="solr.BoolField" sortMissingLast="true" multiValued="true"/>
<fieldType name="tdates" class="solr.TrieDateField" positionIncrementGap="0" multiValued="true" precisionStep="6"/>
<fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>
<fieldType name="tdoubles" class="solr.TrieDoubleField" positionIncrementGap="0" multiValued="true" precisionStep="8"/>

solr-config.xml

  <requestHandler name="/update/extract" startup="lazy" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" >
<entries>
    <entry class="org.apache.tika.parser.pdf.AutoDetectParser"> </entry>
</entries>
<lst name="defaults">
  <str name="lowernames">true</str>
  <str name="fmap.meta">ignored_</str>
  <str name="fmap.content">_text_</str>
  <str name="fmap.id">id</str>
</lst>  
</requestHandler> 

我已经遵循了apache solr官方文档,但是根据它们对solr-config.xml进行的更改仍然存在相同的问题。

1 个答案:

答案 0 :(得分:0)

在阅读完您的问题后,如果我没记错的话,您将发布PDF文件(包含一些文本和图像)进行solr。 Solr只会索引该文档,而不会提取图像并将其存储到其他位置。

Solr在内部使用Tika库来解析文档,但是不能用于您的需求

要满足您的要求,

  1. 解析您的pdf并提取所有图像和其他内容并将其存储
  2. 在solr中为所有提取的内容和pdf编制索引。