我不希望添加/提取任何元数据

时间:2014-03-10 15:39:40

标签: solr solr4 apache-tika

我正在使用Tika索引PDF文件,在Solr-UI中我可以看到很多元数据和其他我不关心的“东西”也被编入索引:

"response": {
    "numFound": 1,
    "start": 0,
    "docs": [
      {
        "meta": [
          "dc:subject",
          "",
          "meta:save-date",
          "2014-01-09T11:07:45Z",
          "subject",
          "",
          "Author",
          "smalik",
          "dcterms:created",
          "2014-01-09T11:07:45Z",
          "date",
          "2014-01-09T11:07:45Z",
          "creator",
          "smalik",
          "Creation-Date",
          "2014-01-09T11:07:45Z",
          "meta:author",
          "johndoe",
          "stream_content_type",
          "",
          "created",
          "Thu Jan 09 12:07:45 CET 2014",
          "stream_size",
          "null",
          "meta:keyword",
          "",
          "cp:subject",
          "",
          "xmp:CreatorTool",
          "PScript5.dll Version 5.2.2",
          "Keywords",
          "",
          "Last-Save-Date",
          "2014-01-09T11:07:45Z",
          "dc:title",
          "E-Mail zur Archivierung",
          "meta:creation-date",
          "2014-01-09T11:07:45Z",
          "dcterms:modified",
          "2014-01-09T11:07:45Z",
          "dc:creator",
          "johndoe",
          "Last-Modified",
          "2014-01-09T11:07:45Z",
          "modified",
          "2014-01-09T11:07:45Z",
          "xmpTPg:NPages",
          "1",
          "producer",
          "www.adlibsoftware.com:EXS41012-Windows 2008 R2:TNG",
          "Content-Type",
          "application/pdf"
        ],
        "div": [
          "page"
        ],
        "id": [
          "aaa11besd4effsujqub6toubqr4m3.pdf"
        ],
        "dc_subject": [
          ""
        ],
        "meta_save_date": [
          "2014-01-09T11:07:45Z"
        ],
        "subject": [
          ""
        ],
        "author": [
          "johndoe"
        ],
        "dcterms_created": [
          "2014-01-09T11:07:45Z"
        ],
        "date": [
          "2014-01-09T11:07:45Z"
        ],
        "creator": [
          "johndoe"
        ],
        "creation_date": [
          "2014-01-09T11:07:45Z"
        ],
        "title": [
          "E-Mail zur Archivierung"
        ],
        "meta_author": [
          "johndoe"
        ],
        "stream_content_type": [
          ""
        ],
        "created": [
          "Thu Jan 09 12:07:45 CET 2014"
        ],
        "stream_size": [
          "null"
        ],
        "meta_keyword": [
          ""
        ],
        "cp_subject": [
          ""
        ],
        "xmp_creatortool": [
          "PScript5.dll Version 5.2.2"
        ],
        "keywords": [
          ""
        ],
        "last_save_date": [
          "2014-01-09T11:07:45Z"
        ],
        "dc_title": [
          "E-Mail zur Archivierung"
        ],
        "meta_creation_date": [
          "2014-01-09T11:07:45Z"
        ],
        "dcterms_modified": [
          "2014-01-09T11:07:45Z"
        ],
        "dc_creator": [
          "johndoe"
        ],
        "last_modified": [
          "2014-01-09T11:07:45Z"
        ],
        "modified": [
          "2014-01-09T11:07:45Z"
        ],
        "xmptpg_npages": [
          "1"
        ],
        "producer": [
          "www.adlibsoftware.com:EXS41012-Windows 2008 R2:TNG"
        ],
        "content_type": [
          "application/pdf"
        ],
        "fullText": [" abcdef"],
        "uid": "d41d8cd98f00b204e9800998ecf8427e"
      }
    ]
  }

由于我只对“fullText”和“id”感兴趣,我想知道应该在schema.xml和/或solrconfig.xml中如何设置、定义哪些内容,才能避免索引所有这些不必要的数据。

我想要的是这样的:

"response": {
        "numFound": 1,
        "start": 0,
        "docs": [
          {
            "id": [
              "aaa11besd4effsujqub6toubqr4m3.pdf"
            ],
            "fullText": [" abcdef"],
            "uid": "d41d8cd98f00b204e9800998ecf8427e"
          }
        ]
      }

目前我的schema.xml和solrconfig.xml如下:

<?xml version="1.0" encoding="UTF-8" ?>
<!--
    Minimal Solr schema: a unique "uid" string key, a catch-all dynamic field,
    and an analyzed "content" field that is also the default search field.
-->
<schema name="simple" version="1.1">
    <types>
        <!-- Plain string type. NOTE(review): postingsFormat="SimpleText" is, per the Lucene docs,
             a human-readable format meant for debugging; confirm it is intended here. -->
        <fieldtype name="string" class="solr.StrField" postingsFormat="SimpleText" />
        <!-- Analyzed text type: newlines are stripped before tokenizing, then the filters below run in order. -->
        <fieldtype name="text" class="solr.TextField" postingsFormat="SimpleText">
            <analyzer>
                <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\n" replacement=""/>
                <tokenizer class="solr.StandardTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory" /> <!--Lowercases the letters in each token. Leaves non-letter tokens alone.-->
                <!-- NOTE(review): the tokenizer above is StandardTokenizer, not ClassicTokenizer, so the next filter may have no effect; verify. -->
                <filter class="solr.ClassicFilterFactory" /> <!--Removes dots from acronyms and 's from the end of tokens. Works only on typed tokens produced by ClassicTokenizer or equivalent.-->
                <filter class="solr.TrimFilterFactory"/> <!--Trims whitespace at either end of a token. -->
                <!-- NOTE(review): no "words" file is given, so the factory's built-in stop word list presumably applies; confirm. -->
                <filter class="solr.StopFilterFactory" ignoreCase="true"/> <!--Discards common words.  -->
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldtype>
    </types>

    <fields>
        <!-- Single-valued document key; referenced by <uniqueKey> below. -->
        <field name="uid" type="string" indexed="true" stored="true"
            multiValued="false" />
        <!-- Catch-all: every field name not declared explicitly is accepted as a multi-valued,
             indexed and stored string.
             NOTE(review): this is presumably why all Tika metadata fields end up indexed and
             returned; setting indexed/stored to false on a catch-all is the usual way to drop
             unknown fields. Confirm against the Solr schema documentation. -->
        <dynamicField name="*" type="string" multiValued="true"
            indexed="true" stored="true" />
        <!-- Analyzed full-text field. NOTE(review): no "stored" attribute is set, so the
             field type / schema default applies; confirm the intended value. -->
        <field name="content" indexed="true"  type="text" multiValued="true" />
    </fields>

    <!-- Queries without an explicit field are run against "content". -->
    <defaultSearchField>content</defaultSearchField>

    <!-- Terms in a query are combined with OR unless an operator is given. -->
    <solrQueryParser defaultOperator="OR" />
    <uniqueKey>uid</uniqueKey>
</schema>


<?xml version="1.0" encoding="UTF-8" ?>
<!--
    Solr core configuration: loads the Solr Cell (Tika) libraries, registers the
    standard, update, extract and admin request handlers, and defines a
    "deduplication" update chain that writes a content signature into "uid".
-->
<config>
    <luceneMatchVersion>LUCENE_45</luceneMatchVersion>
    <directoryFactory name='DirectoryFactory' class='solr.MMapDirectoryFactory' />

    <!-- NOTE(review): SchemaCodecFactory is presumably required for the per-field
         postingsFormat settings used in the schema; confirm. -->
    <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />

    <!-- Extra jars, resolved relative to the core instance directory.
         The paths use backslash separators (Windows style). -->
    <lib dir='${solr.core.instanceDir}\lib' />
    <lib dir="${solr.core.instanceDir}\dist\" regex="solr-cell-\d.*\.jar" />
    <lib dir="${solr.core.instanceDir}\contrib\extraction\lib" regex=".*\.jar" />

    <!-- Default query handler. -->
    <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />

    <!-- Plain updates are routed through the "deduplication" chain defined below. -->
    <requestHandler name="/update" class="solr.UpdateRequestHandler">
        <lst name="defaults">
            <str name="update.chain">deduplication</str>
        </lst>
    </requestHandler>

    <!-- Solr Cell handler: extracts text and metadata from uploaded documents via Tika. -->
    <requestHandler name="/update/extract"
        class="solr.extraction.ExtractingRequestHandler">
        <lst name="defaults">
            <!-- NOTE(review): captureAttr=true presumably indexes attributes of the Tika XHTML
                 elements into fields named after the element, which would explain the "meta"
                 and "div" fields in the sample response; confirm in the Solr Cell docs. -->
            <str name="captureAttr">true</str>
            <!-- Field names are lowercased and non-alphanumeric characters mapped to "_"
                 (e.g. "dc:subject" appears as "dc_subject" in the sample response). -->
            <str name="lowernames">true</str>
            <str name="overwrite">false</str>
            <!-- NOTE(review): duplicate of the captureAttr entry above (same value); one is redundant. -->
            <str name="captureAttr">true</str>
            <str name="literalsOverride">true</str>
            <!-- NOTE(review): uprefix only applies to fields that are not defined in the schema.
                 If the schema has a catch-all "*" dynamic field, every name counts as defined and
                 this prefix presumably never takes effect; an "ignored_*" dynamic field with
                 indexed/stored false would also be needed for it to discard anything. Confirm. -->
            <str name="uprefix">ignored_</str>
            <!-- Field renames: Tika "a" goes to "link", extracted body "content" goes to "fullText". -->
            <str name="fmap.a">link</str>
            <str name="fmap.content">fullText</str>
            <!-- the configuration here could be useful for tests -->
            <str name="update.chain">deduplication</str>
        </lst>
    </requestHandler>

    <!-- Computes a TextProfileSignature over the "content" field and stores it in "uid". -->
    <updateRequestProcessorChain name="deduplication">
        <processor
            class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
            <bool name="overwriteDupes">false</bool>
            <str name="signatureField">uid</str>
            <bool name="enabled">true</bool>
            <!-- NOTE(review): the extract handler maps "content" to "fullText", so "content" may be
                 absent when this processor runs. The uid in the sample response
                 (d41d8cd98f00b204e9800998ecf8427e) is the MD5 digest of empty input, which
                 suggests the signature is being computed over nothing; verify. -->
            <str name="fields">content</str>
            <str name="minTokenLen">10</str>
            <str name="quantRate">.2</str>
            <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
        </processor>
        <processor class="solr.LogUpdateProcessorFactory" />
        <processor class="solr.RunUpdateProcessorFactory" />
    </updateRequestProcessorChain>

    <requestHandler name="/admin/"
        class="org.apache.solr.handler.admin.AdminHandlers" />

    <!-- NOTE(review): "none" disables index locking; presumably only safe when a single
         process writes to the index. Confirm this is intended. -->
    <lockType>none</lockType>

    <admin>
        <!-- Query pre-filled in the admin UI: match all documents. -->
        <defaultQuery>*:*</defaultQuery>
    </admin>

</config>

1 个答案:

答案 0 :(得分:2)

请参阅Alexandre的答案和示例here。如果出现了您不需要的字段,则需要在schema中显式声明它们,并将indexed和stored都设置为false(这意味着Solr会忽略该字段)。您还可以使用动态字段来一次性忽略具有公共前缀或后缀的大量字段,Tika生成的文档通常就属于这种情况。