如何使用solr索引nutch中的插件字段?

时间:2014-08-12 08:17:08

标签: solr nutch

我整合了nutch / solr / hbase来构建搜索引擎,它运行良好,只是schema.xml中的某些字段没有被索引到solr。 schema.xml如下所示:

<schema name="nutch" version="1.5">
    <!-- Field types referenced by the type="..." attribute of the fields below. -->
    <types>
        <fieldType name="string" class="solr.StrField" sortMissingLast="true"
            omitNorms="true"/>
        <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>
        <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>
        <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>

        <!-- text: whitespace tokenizer, then stop-word, word-delimiter,
             lower-case and duplicate-removal filters, in that order. -->
        <fieldType name="text" class="solr.TextField"
            positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.StopFilterFactory"
                    ignoreCase="true" words="stopwords.txt"/>
                <filter class="solr.WordDelimiterFilterFactory"
                    generateWordParts="1" generateNumberParts="1"
                    catenateWords="1" catenateNumbers="1" catenateAll="0"
                    splitOnCaseChange="1"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldType>

        <!-- url: standard tokenizer, then lower-case and word-delimiter filters.
             Exactly one analyzer, opened and closed once, so the schema stays
             well-formed XML. -->
        <fieldType name="url" class="solr.TextField"
            positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.StandardTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
                    generateWordParts="1" generateNumberParts="1"/>
            </analyzer>
        </fieldType>
    </types>

    <!-- Fields, grouped by the Nutch plugin each group is intended for. -->
    <fields>
        <field name="id" type="string" stored="true" indexed="true"/>

        <!-- core fields -->
        <field name="batchId" type="string" stored="true" indexed="false"/>
        <field name="digest" type="string" stored="true" indexed="false"/>
        <field name="boost" type="float" stored="true" indexed="false"/>

        <!-- fields for index-basic plugin -->
        <field name="host" type="url" stored="false" indexed="true"/>
        <field name="url" type="url" stored="true" indexed="true"
            required="true"/>
        <field name="content" type="text" stored="true" indexed="true"/>
        <field name="title" type="text" stored="true" indexed="true"/>
        <field name="cache" type="string" stored="true" indexed="false"/>
        <field name="tstamp" type="date" stored="true" indexed="true"/>

        <field name="_version_" type="long" indexed="true" stored="true"/>

        <!-- fields for index-anchor plugin -->
        <field name="anchor" type="string" stored="true" indexed="true"
            multiValued="true"/>

        <!-- fields for index-more plugin -->
        <field name="type" type="string" stored="true" indexed="true"
            multiValued="true"/>
        <field name="contentLength" type="long" stored="true"
            indexed="true"/>
        <field name="lastModified" type="date" stored="true"
            indexed="true"/>
        <field name="date" type="date" stored="true" indexed="true"/>

        <!-- fields for languageidentifier plugin -->
        <field name="lang" type="string" stored="true" indexed="true"/>

        <!-- fields for subcollection plugin -->
        <field name="subcollection" type="string" stored="true"
            indexed="true" multiValued="true"/>

        <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
        <field name="author" type="string" stored="true" indexed="true"/>
        <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
        <field name="feed" type="string" stored="true" indexed="true"/>
        <field name="publishedDate" type="date" stored="true"
            indexed="true"/>
        <field name="updatedDate" type="date" stored="true"
            indexed="true"/>

        <!-- fields for creativecommons plugin -->
        <field name="cc" type="string" stored="true" indexed="true"
            multiValued="true"/>

        <!-- fields for tld plugin -->
        <field name="tld" type="string" stored="false" indexed="false"/>
    </fields>

    <!-- "id" is the unique key; "content" is the default search field and
         query terms are combined with OR by default. -->
    <uniqueKey>id</uniqueKey>
    <defaultSearchField>content</defaultSearchField>
    <solrQueryParser defaultOperator="OR"/>
</schema>

“core fields”和“fields for index-basic plugin”下的字段已被索引到solr,但是其他字段(例如“fields for index-anchor plugin”和“fields for index-more plugin”下的字段)没有被索引。

那有什么问题?

2 个答案:

答案 0 :(得分:2)

也许您忘了在nutch-default或nutch-site文件中激活这些插件。

<!-- Plugins Nutch should load. index-(basic|anchor|more) enables the three
     indexing plugins whose fields are discussed here. The value must contain
     no whitespace: "scoring-opic" is a single plugin name. -->
<property>
  <name>plugin.includes</name>
  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more)|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
</property>

然后你可能想在solrindex-mapping.xml文件中添加它们......

<!-- Field mapping: every <field> pairs a "source" field name with the "dest"
     field name it is written to. Most names map to themselves; "url" is
     additionally used as the "id" value.
     NOTE(review): "segment" has no matching <field> in the schema.xml shown
     in the question; confirm it exists in the target Solr schema. -->
<fields>
  <field source="content" dest="content"/>
  <field source="title" dest="title"/>
  <field source="host" dest="host"/>
  <field source="segment" dest="segment"/>
  <field source="boost" dest="boost"/>
  <field source="digest" dest="digest"/>
  <field source="tstamp" dest="tstamp"/>
  <field source="anchor" dest="anchor"/>
  <field source="type" dest="type"/>
  <field source="url" dest="id"/>
  <copyField source="url" dest="url"/>
</fields>
<uniqueKey>id</uniqueKey>

编译Nutch并进行新的爬网,您应该能够在solr中看到index-more和index-anchor字段。

答案 1 :(得分:0)

这个配置位于nutch-site.xml中(在你的情况下也可能是nutch-default.xml):

<!-- Plugin list for Nutch; this variant enables index-basic and index-anchor
     together with indexer-solr.
     NOTE(review): index-more is not listed here; add it to the index-(...)
     group if the index-more fields are wanted as well. -->
<property>
  <name>plugin.includes</name>
  <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
</property>