Solr4.1上的CopyField问题

时间:2013-01-30 14:02:41

标签: solr indexing solr4

我正在使用Solr 3.6.1,我非常满意。现在我想继续使用solr4.1。所以我使用了“schema.xml”和“solrconfig.xml”(稍作修改)并将它们放在我的新solr4.1配置下。索引成功(DIH)。但是,我注意到了一个问题。在“schema.xml”中,我有“copyField”指令,以便使用不同的“类型”索引相同的字段。当我尝试使用solr4.1上的相同配置进行索引时,索引大小是solr3.6.1上索引大小的一半(当我查询时得到不同的结果)。 Solr4.1有什么变化吗?我对此一点帮助。

schema.xml:

<?xml version="1.0" encoding="UTF-8" ?>

<schema name="areios_pagos" version="1.5">
  <types>
    <!-- StrField: not analyzed; values are indexed/stored verbatim. -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
    <!-- Boolean type: "true" or "false". -->
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
    <!-- Binary data type. Values are sent/retrieved as Base64-encoded strings.
         The element is spelled fieldType (camelCase) like every other declaration in this file. -->
    <fieldType name="binary" class="solr.BinaryField"/>
    <!--
      Default numeric field types (precisionStep="0").
      For faster range queries, consider the tint/tfloat/tlong/tdouble types below.
    -->
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
    <!-- Same Trie classes with precisionStep="8"; these are the range-query variants referred to above. -->
    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/>

    <!-- Date types: "date" uses precisionStep="0", "tdate" uses precisionStep="6". -->
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>

    <!-- Plain (non-Trie) numeric and date types.
         NOTE(review): no field in the <fields> section of this schema uses the p*/s* types; confirm they are still needed. -->
    <fieldType name="pint" class="solr.IntField"/>
    <fieldType name="plong" class="solr.LongField"/>
    <fieldType name="pfloat" class="solr.FloatField"/>
    <fieldType name="pdouble" class="solr.DoubleField"/>
    <fieldType name="pdate" class="solr.DateField" sortMissingLast="true"/>

    <!-- Sortable* numeric types; norms are omitted and documents without a value sort last. -->
    <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>

    <!-- Type backed by solr.RandomSortField. -->
    <fieldType name="random" class="solr.RandomSortField" indexed="true" />

    <!-- Greek text: standard tokenizer, Greek lowercasing, stop-word removal, Greek stemming.
         The single untyped analyzer serves both indexing and querying. -->
    <fieldType name="text_el"
               class="solr.TextField"
               positionIncrementGap="1000">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <!-- Greek-specific lowercasing (sigma handling) -->
        <filter class="solr.GreekLowerCaseFilterFactory"/>
        <filter class="solr.StopFilterFactory"
                words="lang/stopwords_el.txt"
                ignoreCase="false"
                enablePositionIncrements="true"/>
        <filter class="solr.GreekStemFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Type intended for Tika/PDF-extracted text.
         Index-time and query-time analysis are the same chain, so it is declared once:
         an analyzer without a type attribute is applied to both. -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="1000">
        <analyzer>
            <!-- Strip HTML markup before tokenizing -->
            <charFilter class="solr.HTMLStripCharFilterFactory"/>
            <tokenizer class="solr.StandardTokenizerFactory"/>
            <filter class="solr.GreekLowerCaseFilterFactory"/>
            <filter class="solr.StopFilterFactory"
                    words="lang/stopwords_el.txt"
                    ignoreCase="true"
                    enablePositionIncrements="true"/>
            <filter class="solr.GreekStemFilterFactory"/>
            <!-- Disabled alternative stemmer:
            <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true" />
            -->
        </analyzer>
    </fieldType>

    <!-- Used together with the copyField definitions to help term relevancy:
         whitespace tokenization, at most 20 tokens kept, Greek lowercasing, no stemming or stop words.
         The same chain applies at index and query time, so one untyped analyzer is declared. -->
    <fieldType name="text_areios_pagos_s"
               class="solr.TextField"
               positionIncrementGap="100">
        <analyzer>
            <tokenizer class="solr.WhitespaceTokenizerFactory"/>
            <!-- The token limit is applied before lowercasing, as in the original chain -->
            <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="20"/>
            <filter class="solr.GreekLowerCaseFilterFactory"/>
            <!-- Disabled edge n-gram expansion:
            <filter class="solr.EdgeNGramFilterFactory" minGramSize="3" maxGramSize="100"/>
            -->
        </analyzer>
    </fieldType>
    <!-- End of the relevancy helper type -->

    <!-- Main analyzed Greek text type used by most fields of this schema.
         Index and query analysis are identical, hence a single untyped analyzer. -->
    <fieldType name="text_areios_pagos"
               class="solr.TextField"
               positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.GreekLowerCaseFilterFactory"/>
        <filter class="solr.StopFilterFactory"
                words="lang/stopwords_el.txt"
                ignoreCase="true"
                enablePositionIncrements="true"/>
        <filter class="solr.GreekStemFilterFactory"/>
        <!-- Disabled alternative stemmer:
        <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true" />
        -->
      </analyzer>
    </fieldType>

    <!-- Type for values that are neither stored nor indexed.
         Element names below are spelled fieldType (camelCase) like every other declaration in this file. -->
    <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />

    <!-- Two-dimensional point; sub-fields use the "_d" suffix.
         NOTE(review): no dynamicField matching "*_d" is declared in this schema; confirm before using this type. -->
    <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>

    <!-- Lat/lon type; sub-fields use the "_coordinate" suffix.
         NOTE(review): no dynamicField matching "*_coordinate" is declared in this schema; confirm before using this type. -->
    <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>

    <fieldType name="geohash" class="solr.GeoHashField"/>

    <!-- Currency type; default currency USD, exchange rates read from currency.xml. -->
    <fieldType name="currency" class="solr.CurrencyField" precisionStep="8" defaultCurrency="USD" currencyConfig="currency.xml" />
 </types>



 <fields>
  <!-- Identifiers -->
  <field  name="ida" type="string" indexed="true" stored="true" multiValued="false"/>
  <field  name="solr_id" type="string" indexed="true" stored="true" multiValued="false"/>
  <!-- Decision metadata, analyzed as Greek text -->
  <field  name="apofasi_number" type="text_areios_pagos" indexed="true" stored="true" multiValued="true"/>
  <field  name="apofasi_date" type="text_areios_pagos" indexed="true" stored="true"/>
  <field  name="apofasi_tmima" type="text_areios_pagos" indexed="true" stored="true"/>
  <field  name="apofasi_taxonomy" type="text_areios_pagos" indexed="true" stored="true"/>
  <!-- Catch-all search field; multiValued because several copyField directives target it -->
  <field  name="content" type="text_areios_pagos" indexed="true" stored="true" multiValued="true"/>
  <field  name="type" type="string" indexed="true" stored="true"/>
  <field  name="model" type="string" indexed="true" stored="true" multiValued="false"/>
  <field  name="url" type="string" indexed="true" stored="true"/>
  <field  name="search_tag" type="text_areios_pagos" indexed="true" stored="true"/>
  <field  name="contentbin" type="text" indexed="true" stored="true" multiValued="true"/>
  <field  name="last_modified" type="string" indexed="true" stored="true"/>
  <field  name="title" type="text_areios_pagos" indexed="true" stored="true" multiValued="true"/>
  <field  name="grid_title" type="text_areios_pagos" indexed="true" stored="true"/>
  <!-- Whitespace-tokenized mirror of "content" used for relevancy boosting.
       It is the destination of several copyField directives (and "content" itself is multiValued),
       so it has to be multiValued as well. -->
  <field  name="contentS" type="text_areios_pagos_s" indexed="true" stored="true" multiValued="true"/>
 </fields>

 <uniqueKey>solr_id</uniqueKey>
 <defaultSearchField>content</defaultSearchField>
 <solrQueryParser defaultOperator="AND"/>

   <!-- Aggregate the individual fields into the catch-all "content" field -->
   <copyField source="apofasi_number" dest="content" />
   <copyField source="apofasi_date" dest="content" />
   <copyField source="apofasi_tmima" dest="content" />
   <copyField source="apofasi_taxonomy" dest="content" />
   <copyField source="title" dest="content" />
   <copyField source="search_tag" dest="content" />
   <copyField source="contentbin" dest="content"/>

   <!-- copyField works on the incoming document and does not cascade: values that reach "content"
        through the directives above are NOT forwarded by source="content". Every real source is
        therefore copied to "contentS" directly; source="content" is kept for values that are sent
        to "content" explicitly. -->
   <copyField source="apofasi_number" dest="contentS" />
   <copyField source="apofasi_date" dest="contentS" />
   <copyField source="apofasi_tmima" dest="contentS" />
   <copyField source="apofasi_taxonomy" dest="contentS" />
   <copyField source="title" dest="contentS" />
   <copyField source="search_tag" dest="contentS" />
   <copyField source="contentbin" dest="contentS" />
   <copyField source="content" dest="contentS" />


</schema>

solrconfig.xml

<?xml version="1.0" encoding="UTF-8" ?>

<config>

  <!-- Value is taken from the solr.abortOnConfigurationError property and defaults to true.
       NOTE(review): this element is believed to be obsolete in Solr 4.x; confirm and drop it if so. -->
  <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>


  <!-- Lucene compatibility version this configuration is written against -->
  <luceneMatchVersion>LUCENE_41</luceneMatchVersion>   

  <!-- Index data directory; taken from the solr.data.dir property, empty by default -->
  <dataDir>${solr.data.dir:}</dataDir>

  <!-- Directory implementation; overridable through the solr.directoryFactory property,
       otherwise solr.StandardDirectoryFactory -->
  <directoryFactory name="DirectoryFactory" 
                    class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>

  <!-- No index-writer overrides are configured here -->
  <indexConfig>

  </indexConfig>

  <jmx />

  <!-- Update handler declared without child settings (no autoCommit or updateLog is configured here) -->
  <updateHandler class="solr.DirectUpdateHandler2">   
  </updateHandler>

  <!-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
       Query section - these settings control query time things like caches
       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -->
  <query>

    <!-- Upper bound on the number of clauses in a boolean query -->
    <maxBooleanClauses>2048</maxBooleanClauses>


    <!-- Cache of filter-query (fq) result sets; 512 entries are re-generated for each new searcher -->
    <filterCache class="solr.FastLRUCache"
                        size="2048"
                        initialSize="1024"
                        autowarmCount="512"
                        cleanupThread="true" />

    <!-- Cache of ordered query results; 512 entries are re-generated for each new searcher -->
    <queryResultCache class="solr.FastLRUCache"
                        size="2048"
                        initialSize="1024"
                        autowarmCount="512"
                        cleanupThread="true" />

    <!-- Cache of stored documents. It is keyed by internal Lucene document ids, which change
         between searchers, so it cannot be autowarmed; autowarmCount is therefore 0. -->
    <documentCache class="solr.FastLRUCache"
                        size="2048"
                        initialSize="2048"
                        autowarmCount="0" />

    <fieldValueCache class="solr.FastLRUCache"
                        size="2048"
                        initialSize="512"
                        autowarmCount="512"
                        cleanupThread="true" />

    <!-- Stored fields that were not requested are loaded lazily -->
    <enableLazyFieldLoading>true</enableLazyFieldLoading>

    <!-- Matches the default rows=150 of the /select handler, so one cached window covers one page -->
    <queryResultWindowSize>150</queryResultWindowSize>

    <!-- Largest result list (in documents) that is stored in the queryResultCache -->
    <queryResultMaxDocsCached>200</queryResultMaxDocsCached>

    <!-- Warm-up queries sent on every newSearcher event: three queries, each filtered on
         apofasi_taxonomy, sorted, and fetching the first 150 rows.
         NOTE(review): the sort fields apofasi_date and apofasi_tmima are declared with the analyzed
         type text_areios_pagos in the schema; confirm that sorting on tokenized fields is intended. -->
    <listener event="newSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
        <lst>
          <str name="q">χρησικτησια νομη</str>
          <str name="fq">apofasi_taxonomy:ΠΟΛΙΤΙΚΕΣ</str>
          <str name="sort">apofasi_date asc,ida desc,apofasi_tmima desc</str>
          <str name="start">0</str>
          <str name="rows">150</str>
        </lst>
        <lst>
          <str name="q">νομη</str>
          <str name="fq">apofasi_taxonomy:ΠΟΛΙΤΙΚΕΣ</str>
          <str name="sort">apofasi_date asc,ida desc,apofasi_tmima desc</str>
          <str name="start">0</str>
          <str name="rows">150</str>
        </lst>
        <lst>
          <str name="q">χρησικτησια νομη</str>
          <str name="fq">apofasi_taxonomy:ΠΟΙΝΙΚΕΣ</str>
          <str name="sort">apofasi_date asc,ida desc,apofasi_tmima desc</str>
          <str name="start">0</str>
          <str name="rows">150</str>
        </lst>
      </arr>
    </listener>

    <!-- Warm-up queries sent on the firstSearcher event; the list is the same three queries
         that are configured for newSearcher. -->
    <listener event="firstSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
        <lst>
          <str name="q">χρησικτησια νομη</str>
          <str name="fq">apofasi_taxonomy:ΠΟΛΙΤΙΚΕΣ</str>
          <str name="sort">apofasi_date asc,ida desc,apofasi_tmima desc</str>
          <str name="start">0</str>
          <str name="rows">150</str>
        </lst>
        <lst>
          <str name="q">νομη</str>
          <str name="fq">apofasi_taxonomy:ΠΟΛΙΤΙΚΕΣ</str>
          <str name="sort">apofasi_date asc,ida desc,apofasi_tmima desc</str>
          <str name="start">0</str>
          <str name="rows">150</str>
        </lst>
        <lst>
          <str name="q">χρησικτησια νομη</str>
          <str name="fq">apofasi_taxonomy:ΠΟΙΝΙΚΕΣ</str>
          <str name="sort">apofasi_date asc,ida desc,apofasi_tmima desc</str>
          <str name="start">0</str>
          <str name="rows">150</str>
        </lst>
      </arr>
   </listener>

   <!-- false: requests wait for a warmed searcher instead of being served by a cold one -->
   <useColdSearcher>false</useColdSearcher>

   <!-- At most two searchers may be warming at the same time -->
   <maxWarmingSearchers>2</maxWarmingSearchers>

  </query>

  <!-- HTTP request handling.
       NOTE(review): enableRemoteStreaming="true" lets request parameters name remote content for Solr
       to fetch, and the multipart upload limit is about 2 GB (2048000 KB); confirm that both are
       required and that this endpoint is not reachable by untrusted clients. -->
  <requestDispatcher> 
    <requestParsers enableRemoteStreaming="true" multipartUploadLimitInKB="2048000" />    
    <!-- never304="true": no HTTP cache-validation (304) responses are generated -->
    <httpCaching never304="true" />
  </requestDispatcher>

  <!-- DataImportHandler (DIH); its import definition is read from data-config.xml -->
  <requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
    <lst name="defaults">
        <str name="config">data-config.xml</str>
    </lst>
  </requestHandler>

  <!-- Main search handler: edismax over "content" and the boosted "contentS" field, JSON output,
       highlighting enabled. Every default is declared exactly once; repeating defType/qf/pf/ps
       in the same list produces multi-valued defaults and makes later edits error-prone. -->
  <requestHandler name="/select" class="solr.SearchHandler">
     <lst name="defaults">
       <!-- Query parsing and scoring -->
       <str name="defType">edismax</str>
       <str name="qf">content contentS^10</str>
       <str name="pf">content^10 contentS^100</str>
       <str name="ps">100</str>
       <!-- Response shape -->
       <str name="echoParams">explicit</str>
       <int name="rows">150</int>
       <str name="sort">score desc</str>
       <str name="wt">json</str>
       <!-- NOTE(review): "keywordlist" is not declared in the schema's <fields> section; confirm it is still wanted -->
       <str name="fl">solr_id,ida,type,model,keywordlist,title,apofasi_taxonomy,apofasi_tmima,apofasi_date,grid_title</str>
       <!-- Highlighting: snippets from content and title; when content yields no snippet,
            up to 800 characters of content itself are returned instead -->
       <str name="hl">true</str>
       <str name="hl.fl">content,title</str>
       <str name="f.content.hl.alternateField">content</str>
       <str name="hl.maxAlternateFieldLength">800</str>
       <str name="hl.fragsize">800</str>
     </lst>
  </requestHandler>

  <!-- Unified update handler. From Solr 4.0 on, solr.UpdateRequestHandler dispatches on the
       request Content-Type (XML, JSON, CSV, javabin) and replaces the deprecated
       solr.XmlUpdateRequestHandler; XML updates posted to /update keep working unchanged. -->
  <requestHandler name="/update"
                  class="solr.UpdateRequestHandler">
  </requestHandler>

  <!-- Format-specific endpoints kept so existing clients that post to these paths keep working -->
  <requestHandler name="/update/javabin"
                  class="solr.BinaryUpdateRequestHandler" />

  <requestHandler name="/update/csv"
                  class="solr.CSVRequestHandler"
                  startup="lazy" />

  <requestHandler name="/update/json"
                  class="solr.JsonUpdateRequestHandler"
                  startup="lazy" />

  <!-- Extracting handler (Tika / Solr Cell), loaded lazily.
       NOTE(review): the mappings below target "text", "links" and the "ignored_" prefix, but the
       schema declares no field named text or links and no dynamicField for ignored_*; confirm these
       mappings against the schema before using this handler. -->
  <requestHandler name="/update/extract" 
                  startup="lazy"
                  class="solr.extraction.ExtractingRequestHandler" >
    <lst name="defaults">
      <!-- Extracted body text is mapped to the field "text" -->
      <str name="fmap.content">text</str>
      <str name="lowernames">true</str>
      <!-- Fields unknown to the schema get the prefix "ignored_" -->
      <str name="uprefix">ignored_</str>      
      <str name="fmap.Last-Modified">last_modified</str>
      <str name="captureAttr">true</str>
      <str name="fmap.a">links</str>
      <str name="fmap.div">ignored_</str>
    </lst>
  </requestHandler>

  <!-- XSLT-transforming update handler, loaded lazily -->
  <requestHandler name="/update/xslt"
                   startup="lazy"
                   class="solr.XsltUpdateRequestHandler"/>

  <!-- Analysis inspection handlers (per field and per document), loaded lazily -->
  <requestHandler name="/analysis/field" 
                  startup="lazy"
                  class="solr.FieldAnalysisRequestHandler" />

  <requestHandler name="/analysis/document" 
                  class="solr.DocumentAnalysisRequestHandler" 
                  startup="lazy" />

  <!-- Admin handlers registered under /admin/ -->
  <requestHandler name="/admin/" 
                  class="solr.admin.AdminHandlers" />


  <!-- Ping/healthcheck: always runs the fixed query "solrpingquery" (declared as an invariant,
       so request parameters cannot override it) and echoes all parameters -->
  <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
    <lst name="invariants">
      <str name="q">solrpingquery</str>
    </lst>
    <lst name="defaults">
      <str name="echoParams">all</str>
    </lst>
  </requestHandler>

  <!-- Echo the request contents back to the client (debugging aid) -->
  <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
    <lst name="defaults">
     <str name="echoParams">explicit</str> 
     <str name="echoHandler">true</str>
    </lst>
  </requestHandler>

  <!-- Spell-check component with a single checker named "default".
       NOTE(review): it refers to the field type "textSpell" and the field "name", neither of which
       is declared in the schema; confirm before enabling /spell. -->
  <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
    <str name="queryAnalyzerFieldType">textSpell</str>
    <lst name="spellchecker">
      <str name="name">default</str>
      <str name="field">name</str>
      <str name="spellcheckIndexDir">spellchecker</str>     
    </lst>    
  </searchComponent>


  <!-- Search handler that appends the spellcheck component, loaded lazily.
       NOTE(review): df points at "text", which is a field type in the schema, not a field; confirm. -->
  <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
    <lst name="defaults">
      <str name="df">text</str>
      <str name="spellcheck.onlyMorePopular">false</str>
      <str name="spellcheck.extendedResults">false</str>
      <str name="spellcheck.count">1</str>
    </lst>
    <arr name="last-components">
      <str>spellcheck</str>
    </arr>
  </requestHandler>  

  <!-- Terms component -->
  <searchComponent name="terms" class="solr.TermsComponent"/>

  <!-- A request handler for demonstrating the terms component; "terms" is its only component -->
  <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
     <lst name="defaults">
      <bool name="terms">true</bool>
    </lst>     
    <arr name="components">
      <str>terms</str>
    </arr>
  </requestHandler>

  <!-- Query elevation component; its rules are read from elevate.xml -->
  <searchComponent name="elevator" class="solr.QueryElevationComponent" >
    <!-- pick a fieldType to analyze queries -->
    <str name="queryFieldType">string</str>
    <str name="config-file">elevate.xml</str>
  </searchComponent>

  <!-- A request handler for demonstrating the elevator component.
       NOTE(review): df points at "text", which is a field type in the schema, not a field; confirm. -->
  <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <str name="df">text</str>
    </lst>
    <arr name="last-components">
      <str>elevator</str>
    </arr>
  </requestHandler>  

  <!-- Highlighting component: declares the available fragmenters, formatter, encoder,
       fragListBuilders, fragmentsBuilders and boundaryScanners. Entries marked default="true"
       are the ones used when a request does not name one. -->
  <searchComponent class="solr.HighlightComponent" name="highlight">
    <highlighting>
      <!-- Default fragmenter (gap-based); no default parameters are set -->
      <fragmenter name="gap" 
                  default="true"
                  class="solr.highlight.GapFragmenter">
        <lst name="defaults">
        </lst>
      </fragmenter>
      <!-- Regex-based fragmenter, selectable by name -->
      <fragmenter name="regex" 
                  class="solr.highlight.RegexFragmenter">
        <lst name="defaults">
          <!-- slightly smaller fragsizes work better because of slop -->
          <int name="hl.fragsize">70</int>
          <!-- allow 50% slop on fragment sizes -->
          <float name="hl.regex.slop">0.5</float>
          <!-- a basic sentence pattern -->
          <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
        </lst>
      </fragmenter>
      <!-- Configure the standard formatter: matches are wrapped in <shl> ... </shl> -->
      <formatter name="html" 
                 default="true"
                 class="solr.highlight.HtmlFormatter">
        <lst name="defaults">         
          <str name="hl.simple.pre">&lt;shl&gt;</str>
          <str name="hl.simple.post">&lt;/shl&gt;</str>       
        </lst>
      </formatter>

      <!-- Configure the standard encoder -->
      <encoder name="html" 
               class="solr.highlight.HtmlEncoder" />

      <!-- Configure the standard fragListBuilder -->
      <fragListBuilder name="simple" 
                       default="true"
                       class="solr.highlight.SimpleFragListBuilder"/>

      <!-- Configure the single fragListBuilder -->
      <fragListBuilder name="single" 
                       class="solr.highlight.SingleFragListBuilder"/>

      <!-- default tag FragmentsBuilder -->
      <fragmentsBuilder name="default" 
                        default="true"
                        class="solr.highlight.ScoreOrderFragmentsBuilder">    
      </fragmentsBuilder>

      <!-- Alternative FragmentsBuilder with a comma-separated list of coloured opening tags -->
      <fragmentsBuilder name="colored" 
                        class="solr.highlight.ScoreOrderFragmentsBuilder">
        <lst name="defaults">
          <str name="hl.tag.pre"><![CDATA[
               <b style="background:yellow">,<b style="background:lawgreen">,
               <b style="background:aquamarine">,<b style="background:magenta">,
               <b style="background:palegreen">,<b style="background:coral">,
               <b style="background:wheat">,<b style="background:khaki">,
               <b style="background:lime">,<b style="background:deepskyblue">]]></str>
          <str name="hl.tag.post"><![CDATA[</b>]]></str>
        </lst>
      </fragmentsBuilder>

      <!-- Default boundary scanner: scans up to 10 characters for one of the listed boundary
           characters (punctuation, space, tab, line feed, carriage return) -->
      <boundaryScanner name="default" 
                       default="true"
                       class="solr.highlight.SimpleBoundaryScanner">
        <lst name="defaults">
          <str name="hl.bs.maxScan">10</str>
          <str name="hl.bs.chars">.,!? &#9;&#10;&#13;</str>
        </lst>
      </boundaryScanner>

      <!-- Break-iterator boundary scanner on WORD boundaries.
           NOTE(review): the locale is en/US while the indexed content is Greek; confirm. -->
      <boundaryScanner name="breakIterator" 
                       class="solr.highlight.BreakIteratorBoundaryScanner">
        <lst name="defaults">

          <str name="hl.bs.type">WORD</str>

          <str name="hl.bs.language">en</str>
          <str name="hl.bs.country">US</str>
        </lst>
      </boundaryScanner>
    </highlighting>
  </searchComponent>

  <!-- JSON response writer; responses are labelled text/plain with UTF-8 charset -->
  <queryResponseWriter name="json" class="solr.JSONResponseWriter">

    <str name="content-type">text/plain; charset=UTF-8</str>
  </queryResponseWriter>


    <!-- Velocity response writer, loaded lazily -->
    <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/>


  <!-- XSLT response writer; xsltCacheLifetimeSeconds is set to 5 -->
  <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
    <int name="xsltCacheLifetimeSeconds">5</int>
  </queryResponseWriter>
  <!-- Legacy admin settings: default query is match-all -->
  <admin>
    <defaultQuery>*:*</defaultQuery>


  </admin>

</config>

此致

汤姆

1 个答案:

答案 0 :(得分:4)

Solr 4.1以压缩的方式维护存储的字段,这可以解释索引大小的减少。

另外,

<copyField source="content" dest="contentS" />

文档@ http://wiki.apache.org/solr/SchemaXml#Copy_Fields

  

副本在流源级别完成,没有副本反馈到另一个副本。

将copyfield作为copyfield标记的来源不起作用 复制域源必须是实际字段,它具有一些值并且不会级联。

您还可以查看http://lucene.472066.n3.nabble.com/does-copyField-recurse-td2450208.html

这对你有用吗?