我需要索引一批 PDF 文件(PDF/A),其中一些没有问题,但对于另一些文件,当我查看索引后的内容时,只能看到大量带问号的菱形(�)。
我认为问题出在文档所使用的字体上,或者文档内容是以图片形式“封装”的。
有没有办法让 Tika 只提取 PDF 中“可读/可解析”的文字?
下面是一个例子:当我(通过我的 Java 应用程序)查询所有文档时,在日志文件中看到的有问题的文件内容如下:
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0xe8]?[0x1]d41d8cd98f00b204e9800998ecf8427e[0xb][0xa4][0xe5][0x81](Diverses[0xe6]=aabhpdtyan3vfsujquccemebqr4m3[0xe7][0x81]?[0xc1][0x4] [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " E-Mail zur Archivierung [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0x9][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">
DEBUG org.apache.http.wire - << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\r][\n]">
DEBUG org.apache.http.wire - << "[0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][0xef][0xbf][0xbd][\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0xef][0xbf][0xbd] [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [\n]">
DEBUG org.apache.http.wire - << " [0x9] data1.pdf [\n]">
另一个问题是,所有文件(包括“正常”的文件)的内容字段开头都有一长串 \n,您在上面的日志中也可以看到。怎样才能避免这种情况?
这是我的schema.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Solr schema "simple".
  Declares three field types (string, ignored, text) and the fields used by
  the extraction handler: id (unique key), signatureField, rmDocumentTitle,
  fullText (default search field) and the ignored_* catch-all.
-->
<schema name="simple" version="1.1">
  <types>
    <fieldtype name="string" class="solr.StrField" postingsFormat="SimpleText" />
    <!-- Type behind the ignored_* dynamic field, which is neither indexed nor stored. -->
    <fieldtype name="ignored" class="solr.TextField" />
    <fieldtype name="text" class="solr.TextField" postingsFormat="SimpleText">
      <analyzer>
        <!--
          Turn every newline into a space rather than deleting it: with an
          empty replacement the last word of a line is glued to the first
          word of the next line ("foo\nbar" would be tokenized as "foobar").
          NOTE(review): char filters act on the analyzed text only; the
          stored value of the field presumably still contains the newlines.
        -->
        <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\n" replacement=" "/>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory" /> <!--Lowercases the letters in each token. Leaves non-letter tokens alone.-->
        <filter class="solr.ClassicFilterFactory" /> <!--Removes dots from acronyms and 's from the end of tokens. Works only on typed tokens produced by ClassicTokenizer or equivalent.-->
        <filter class="solr.TrimFilterFactory"/> <!--Trims whitespace at either end of a token. -->
        <filter class="solr.StopFilterFactory" ignoreCase="true"/> <!--Discards common words. -->
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldtype>
  </types>
  <fields>
    <!-- Receives the hash written by the "deduplication" update chain. -->
    <field name="signatureField" type="string" indexed="true" stored="true" multiValued="false" />
    <!-- Sink for fields prefixed with ignored_ (see uprefix in solrconfig.xml). -->
    <dynamicField name="ignored_*" type="ignored" multiValued="true" indexed="false" stored="false" />
    <field name="id" type="string" indexed="true" stored="true" multiValued="false" />
    <field name="rmDocumentTitle" type="string" indexed="true" stored="true" multiValued="true"/>
    <!-- Extracted document text; the extraction handler maps "content" here. -->
    <field name="fullText" indexed="true" type="text" multiValued="true" />
  </fields>
  <defaultSearchField>fullText</defaultSearchField>
  <solrQueryParser defaultOperator="OR" />
  <uniqueKey>id</uniqueKey>
</schema>
和我的solrconfig.xml:
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Solr core configuration.
  Registers the default search handler, the plain update handler, the
  Tika-based extraction handler (/update/extract) and the "deduplication"
  update chain that both update handlers run.
-->
<config>
  <luceneMatchVersion>LUCENE_45</luceneMatchVersion>
  <directoryFactory name="DirectoryFactory" class="solr.MMapDirectoryFactory" />
  <!-- Needed so that postingsFormat declared per field type in schema.xml is honoured. -->
  <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />

  <!-- Extra jars: core-local libs, the solr-cell jar and the extraction (Tika) libs. -->
  <lib dir="${solr.core.instanceDir}\lib" />
  <lib dir="${solr.core.instanceDir}\dist\" regex="solr-cell-\d.*\.jar" />
  <lib dir="${solr.core.instanceDir}\contrib\extraction\lib" regex=".*\.jar" />

  <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />

  <requestHandler name="/update" class="solr.UpdateRequestHandler">
    <lst name="defaults">
      <str name="update.chain">deduplication</str>
    </lst>
  </requestHandler>

  <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler">
    <lst name="defaults">
      <!-- captureAttr was declared twice; one declaration is sufficient. -->
      <str name="captureAttr">true</str>
      <str name="lowernames">false</str>
      <str name="overwrite">false</str>
      <str name="literalsOverride">true</str>
      <!-- Fields unknown to the schema are prefixed so they match ignored_*. -->
      <str name="uprefix">ignored_</str>
      <str name="fmap.a">link</str>
      <!-- The extracted body text ("content") is indexed as fullText. -->
      <str name="fmap.content">fullText</str>
      <!-- the configuration here could be useful for tests -->
      <str name="update.chain">deduplication</str>
    </lst>
  </requestHandler>

  <updateRequestProcessorChain name="deduplication">
    <processor
        class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
      <bool name="overwriteDupes">false</bool>
      <str name="signatureField">signatureField</str>
      <bool name="enabled">true</bool>
      <!--
        Sign the field that actually exists on the document. The extraction
        handler renames "content" to "fullText" (fmap.content above), so a
        signature over "content" is computed from empty input and comes out
        the same for every document.
      -->
      <str name="fields">fullText</str>
      <str name="minTokenLen">10</str>
      <str name="quantRate">.2</str>
      <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
    </processor>
    <processor class="solr.LogUpdateProcessorFactory" />
    <processor class="solr.RunUpdateProcessorFactory" />
  </updateRequestProcessorChain>

  <requestHandler name="/admin/"
                  class="org.apache.solr.handler.admin.AdminHandlers" />
  <lockType>none</lockType>
  <admin>
    <defaultQuery>*:*</defaultQuery>
  </admin>
</config>
答案 0 :(得分:0)
Re:Diamond with a question mark
- 它是一个非UTF-8字符。见Why does a diamond with a questionmark in it � appear in my HTML?
尝试使用ASCIIFoldingFilterFactory并查看它是否适合您。
**更新**
既然上面的方法不起作用,您可以尝试在分析器链中加入下面的配置,把可打印 ASCII 范围(从空格字符开始)之外的所有字符都排除掉吗?
<!--
  Replaces every character outside the range \x20-\x7F with a space before
  tokenization. A space is used instead of an empty replacement because the
  pattern also matches newlines and tabs; deleting those would glue the
  neighbouring words into one token.
  Note: this also strips legitimate non-ASCII letters (e.g. umlauts).
-->
<charFilter class="solr.PatternReplaceCharFilterFactory"
            pattern="[^\x20-\x7F]"
            replacement=" "/>