Solr 搜索可以返回结果,但还需要返回被搜索文本前后的若干句子

时间:2012-06-15 09:48:40

标签: solr apache-tika

我正在使用 Apache Solr 3.6.0 。我用这个命令索引了一个文件:

 curl "http://localhost:8983/solr/update/extract?stream.file=/home/Desktop/DOCUMENTS/x.pdf&stream.contentType=application/pdf&literal.id=DOC_N&commit=true"

当我搜索文本时,例如:

"http://localhost:8983/solr/select/?q=Getting+Started&version=2.2&start=0&rows=10&indent=on"

结果返回:

<response>
    <lst name="responseHeader">
        <int name="status">0</int>
        <int name="QTime">12</int>
        <lst name="params">
            <str name="indent">on</str>
            <str name="start">0</str>
            <str name="q">Getting Started</str>
            <str name="version">2.2</str>
            <str name="rows">10</str>
        </lst>
    </lst>
    <result name="response" numFound="3" start="0">
        <doc>
            <arr name="content_type">
                <str>application/pdf</str>
            </arr>
            <str name="id">doc2</str>
        </doc>
        <doc>
            <arr name="content_type">
                <str>application/pdf</str>
            </arr>
            <str name="id">1</str>
        </doc>
        <doc>
            <arr name="content_type">
                <str>application/pdf</str>
            </arr>
            <str name="id">DOC_N</str>
        </doc>
    </result>
</response>

我得到的结果是在 PDF 中命中的文档。现在我想实现这样一种搜索:对于我在已编入索引的 PDF 中搜索的那个词,返回它之前和之后的一些文本。请帮忙。

这是我的solrconfig.xml

<?xml version="1.0" encoding="UTF-8" ?>
<config>
    <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>
    <luceneMatchVersion>LUCENE_36</luceneMatchVersion>
    <lib dir="../../dist/" regex="apache-solr-cell-\d.*\.jar" />
    <lib dir="../../contrib/extraction/lib" regex=".*\.jar" />
    <lib dir="../../dist/" regex="apache-solr-clustering-\d.*\.jar" />
    <lib dir="../../contrib/clustering/lib/" regex=".*\.jar" />
    <lib dir="../../dist/" regex="apache-solr-dataimporthandler-\d.*\.jar" />
    <lib dir="../../contrib/dataimporthandler/lib/" regex=".*\.jar" />
    <lib dir="../../dist/" regex="apache-solr-langid-\d.*\.jar" />
    <lib dir="../../contrib/langid/lib/" regex=".*\.jar" />
    <lib dir="../../dist/" regex="apache-solr-velocity-\d.*\.jar" />
    <lib dir="../../contrib/velocity/lib" regex=".*\.jar" />
    <dataDir>${solr.data.dir:}</dataDir>
    <directoryFactory name="DirectoryFactory" 
                        class="${solr.directoryFactory:solr.StandardDirectoryFactory}"/>
    <indexConfig>
    </indexConfig>
    <jmx />
    <query>
        <maxBooleanClauses>1024</maxBooleanClauses>
        <filterCache class="solr.FastLRUCache"
                     size="512"
                     initialSize="512"
                     autowarmCount="0"/>
        <queryResultCache class="solr.LRUCache"
                          size="512"
                          initialSize="512"
                          autowarmCount="0"/>
        <documentCache class="solr.LRUCache"
                       size="512"
                       initialSize="512"
                       autowarmCount="0"/>
        <enableLazyFieldLoading>true</enableLazyFieldLoading>
        <queryResultWindowSize>20</queryResultWindowSize>
        <queryResultMaxDocsCached>200</queryResultMaxDocsCached>
        <listener event="newSearcher" class="solr.QuerySenderListener">
            <arr name="queries">
            </arr>
        </listener>
        <listener event="firstSearcher" class="solr.QuerySenderListener">
            <arr name="queries">
                <lst>
                    <str name="q">static firstSearcher warming in solrconfig.xml</str>
                </lst>
            </arr>
        </listener>
        <useColdSearcher>false</useColdSearcher>
        <maxWarmingSearchers>2</maxWarmingSearchers>
    </query>
    <requestDispatcher>
        <requestParsers enableRemoteStreaming="true" 
                        multipartUploadLimitInKB="2048000" />
        <httpCaching never304="true" />
    </requestDispatcher>
    <requestHandler name="/select" class="solr.SearchHandler">
        <lst name="defaults">
            <str name="echoParams">explicit</str>
            <int name="rows">10</int>
            <str name="df">text</str>
        </lst>
    </requestHandler>
    <!-- Velocity-based browse UI handler. The block below only sets request
         defaults (edismax parsing, MoreLikeThis fields, and facet settings).
         The defaults list and the handler element are closed here so that the
         handlers and components that follow are siblings of this handler
         rather than children of its defaults list. -->
    <requestHandler name="/browse" class="solr.SearchHandler">
        <lst name="defaults">
            <str name="echoParams">explicit</str>
            <!-- VelocityResponseWriter settings -->
            <str name="wt">velocity</str>
            <str name="v.template">browse</str>
            <str name="v.layout">layout</str>
            <str name="title">Solritas</str>
            <!-- Query parsing settings -->
            <str name="df">text</str>
            <str name="defType">edismax</str>
            <str name="q.alt">*:*</str>
            <str name="rows">10</str>
            <str name="fl">*,score</str>
            <!-- MoreLikeThis settings -->
            <str name="mlt.qf">
                text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
            </str>
            <str name="mlt.fl">text,features,name,sku,id,manu,cat</str>
            <int name="mlt.count">3</int>
            <str name="qf">
                text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
            </str>
            <!-- Faceting settings -->
            <str name="facet">on</str>
            <str name="facet.field">cat</str>
            <str name="facet.field">manu_exact</str>
            <str name="facet.query">ipod</str>
            <str name="facet.query">GB</str>
            <str name="facet.mincount">1</str>
            <str name="facet.pivot">cat,inStock</str>
            <str name="facet.range.other">after</str>
            <str name="facet.range">price</str>
            <int name="f.price.facet.range.start">0</int>
            <int name="f.price.facet.range.end">600</int>
            <int name="f.price.facet.range.gap">50</int>
            <str name="facet.range">popularity</str>
            <int name="f.popularity.facet.range.start">0</int>
            <int name="f.popularity.facet.range.end">10</int>
            <int name="f.popularity.facet.range.gap">3</int>
            <str name="facet.range">manufacturedate_dt</str>
            <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str>
            <str name="f.manufacturedate_dt.facet.range.end">NOW</str>
            <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str>
            <str name="f.manufacturedate_dt.facet.range.other">before</str>
            <str name="f.manufacturedate_dt.facet.range.other">after</str>
        </lst>
    </requestHandler>
            <requestHandler name="/update/javabin" 
                            class="solr.BinaryUpdateRequestHandler" />
            <requestHandler name="/update/csv" 
                            class="solr.CSVRequestHandler" 
                            startup="lazy" />
            <requestHandler name="/update/json" 
                            class="solr.JsonUpdateRequestHandler" 
                            startup="lazy" />
            <requestHandler name="/update/extract" 
                            startup="lazy"
                            class="solr.extraction.ExtractingRequestHandler" >
                <lst name="defaults">
                    <!-- All the main content goes into "text"... if you need to return
                    the extracted text or do highlighting, use a stored field. -->
                    <str name="fmap.content">text</str>
                    <str name="lowernames">true</str>
                    <str name="uprefix">ignored_</str>
                    <!-- capture link hrefs but ignore div attributes -->
                    <str name="captureAttr">true</str>
                    <str name="fmap.a">links</str>
                    <str name="fmap.div">ignored_</str>
                </lst>
            </requestHandler>
            <requestHandler name="/update/xslt"
                            startup="lazy"
                            class="solr.XsltUpdateRequestHandler"/>
            <requestHandler name="/analysis/field" 
                            startup="lazy"
                            class="solr.FieldAnalysisRequestHandler" />
            <requestHandler name="/analysis/document" 
                            class="solr.DocumentAnalysisRequestHandler" 
                            startup="lazy" />
            <requestHandler name="/admin/" 
                            class="solr.admin.AdminHandlers" />
            <requestHandler name="/admin/ping" class="solr.PingRequestHandler">
                <lst name="invariants">
                    <str name="q">solrpingquery</str>
                </lst>
                <lst name="defaults">
                    <str name="echoParams">all</str>
                </lst>
            </requestHandler>
            <!-- Echo the request contents back to the client -->
            <requestHandler name="/debug/dump" class="solr.DumpRequestHandler" >
                <lst name="defaults">
                    <str name="echoParams">explicit</str> 
                    <str name="echoHandler">true</str>
                </lst>
            </requestHandler>
            <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
                <str name="queryAnalyzerFieldType">textSpell</str>
                <lst name="spellchecker">
                    <str name="name">default</str>
                    <str name="field">name</str>
                    <str name="spellcheckIndexDir">spellchecker</str>
                    <!-- uncomment this to require terms to occur in 1% of the documents 
                       in order to be included in the dictionary
                    -->
                </lst>
            </searchComponent>
            <requestHandler name="/spell" class="solr.SearchHandler" startup="lazy">
                <lst name="defaults">
                    <str name="df">text</str>
                    <str name="spellcheck.onlyMorePopular">false</str>
                    <str name="spellcheck.extendedResults">false</str>
                    <str name="spellcheck.count">1</str>
                </lst>
                <arr name="last-components">
                    <str>spellcheck</str>
                </arr>
            </requestHandler>
            <searchComponent name="tvComponent" class="solr.TermVectorComponent"/>
            <requestHandler name="/tvrh" class="solr.SearchHandler" startup="lazy">
                <lst name="defaults">
                    <str name="df">text</str>
                    <bool name="tv">true</bool>
                </lst>
                <arr name="last-components">
                    <str>tvComponent</str>
                </arr>
            </requestHandler>
            <searchComponent name="clustering" 
                             enable="${solr.clustering.enabled:false}"
                             class="solr.clustering.ClusteringComponent" >
                <!-- Declare an engine -->
                <lst name="engine">
                    <!-- The name, only one can be named "default" -->
                    <str name="name">default</str>
                    <!-- Class name of Carrot2 clustering algorithm. 
                       Currently available algorithms are:
                       * org.carrot2.clustering.lingo.LingoClusteringAlgorithm
                       * org.carrot2.clustering.stc.STCClusteringAlgorithm
                       * org.carrot2.clustering.kmeans.BisectingKMeansClusteringAlgorithm
                       See http://project.carrot2.org/algorithms.html for the
                       algorithm's characteristics.
                    -->
                    <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>
                    <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>
                    <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>
                    <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
                </lst>
                <lst name="engine">
                    <str name="name">stc</str>
                    <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
                </lst>
            </searchComponent>
            <!-- A request handler for demonstrating the clustering component
           This is purely as an example.
               In reality you will likely want to add the component to your 
               already specified request handlers. 
            -->
            <requestHandler name="/clustering"
                            startup="lazy"
                            enable="${solr.clustering.enabled:false}"
                            class="solr.SearchHandler">
                <lst name="defaults">
                    <bool name="clustering">true</bool>
                    <str name="clustering.engine">default</str>
                    <bool name="clustering.results">true</bool>
                    <!-- The title field -->
                    <str name="carrot.title">name</str>
                    <str name="carrot.url">id</str>
                    <!-- The field to cluster on -->
                    <str name="carrot.snippet">features</str>
                    <!-- produce summaries -->
                    <bool name="carrot.produceSummary">true</bool>
                    <bool name="carrot.outputSubClusters">false</bool>
                    <str name="df">text</str>
                    <str name="defType">edismax</str>
                    <str name="qf">
                        text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
                    </str>
                    <str name="q.alt">*:*</str>
                    <str name="rows">10</str>
                    <str name="fl">*,score</str>
                </lst>     
                <arr name="last-components">
                    <str>clustering</str>
                </arr>
            </requestHandler>
            <searchComponent name="terms" class="solr.TermsComponent"/>
            <requestHandler name="/terms" class="solr.SearchHandler" startup="lazy">
                <lst name="defaults">
                    <bool name="terms">true</bool>
                </lst>     
                <arr name="components">
                    <str>terms</str>
                </arr>
            </requestHandler>
            <searchComponent name="elevator" class="solr.QueryElevationComponent" >
                <!-- pick a fieldType to analyze queries -->
                <str name="queryFieldType">string</str>
                <str name="config-file">elevate.xml</str>
            </searchComponent>
            <!-- A request handler for demonstrating the elevator component -->
            <requestHandler name="/elevate" class="solr.SearchHandler" startup="lazy">
                <lst name="defaults">
                    <str name="echoParams">explicit</str>
                    <str name="df">text</str>
                </lst>
                <arr name="last-components">
                    <str>elevator</str>
                </arr>
            </requestHandler>
            <searchComponent class="solr.HighlightComponent" name="highlight">
                <highlighting>
                    <!-- Configure the standard fragmenter -->
                    <!-- This could most likely be commented out in the "default" case -->
                    <fragmenter name="gap" 
                                default="true"
                                class="solr.highlight.GapFragmenter">
                        <lst name="defaults">
                            <int name="hl.fragsize">100</int>
                        </lst>
                    </fragmenter>
                    <!-- A regular-expression-based fragmenter 
                       (for sentence extraction) 
                    -->
                    <fragmenter name="regex" 
                                class="solr.highlight.RegexFragmenter">
                        <lst name="defaults">
                            <!-- slightly smaller fragsizes work better because of slop -->
                            <int name="hl.fragsize">70</int>
                            <!-- allow 50% slop on fragment sizes -->
                            <float name="hl.regex.slop">0.5</float>
                            <!-- a basic sentence pattern -->
                            <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
                        </lst>
                    </fragmenter>
                    <!-- Configure the standard formatter -->
                    <formatter name="html" 
                               default="true"
                               class="solr.highlight.HtmlFormatter">
                        <lst name="defaults">
                            <str name="hl.simple.pre"><![CDATA[<em>]]></str>
                            <str name="hl.simple.post"><![CDATA[</em>]]></str>
                        </lst>
                    </formatter>
                    <encoder name="html" 
                             class="solr.highlight.HtmlEncoder" />
                    <fragListBuilder name="simple" 
                                     default="true"
                                     class="solr.highlight.SimpleFragListBuilder"/>
                    <!-- Configure the single fragListBuilder -->
                    <fragListBuilder name="single" 
                                     class="solr.highlight.SingleFragListBuilder"/>
                    <!-- default tag FragmentsBuilder -->
                    <fragmentsBuilder name="default" 
                                      default="true"
                                      class="solr.highlight.ScoreOrderFragmentsBuilder">
                        <!-- 
                        <lst name="defaults">
                          <str name="hl.multiValuedSeparatorChar">/</str>
                        </lst>
                        -->
                    </fragmentsBuilder>
                    <!-- multi-colored tag FragmentsBuilder -->
                    <fragmentsBuilder name="colored" 
                                      class="solr.highlight.ScoreOrderFragmentsBuilder">
                        <lst name="defaults">
                            <str name="hl.tag.pre"><![CDATA[
                   <b style="background:yellow">,<b style="background:lawgreen">,
                   <b style="background:aquamarine">,<b style="background:magenta">,
                   <b style="background:palegreen">,<b style="background:coral">,
                   <b style="background:wheat">,<b style="background:khaki">,
                   <b style="background:lime">,<b style="background:deepskyblue">]]></str>
                            <str name="hl.tag.post"><![CDATA[</b>]]></str>
                        </lst>
                    </fragmentsBuilder>
                    <boundaryScanner name="default" 
                                     default="true"
                                     class="solr.highlight.SimpleBoundaryScanner">
                        <lst name="defaults">
                            <str name="hl.bs.maxScan">10</str>
                            <str name="hl.bs.chars">.,!? &#9;&#10;&#13;</str>
                        </lst>
                    </boundaryScanner>
                    <boundaryScanner name="breakIterator" 
                                     class="solr.highlight.BreakIteratorBoundaryScanner">
                        <lst name="defaults">
                            <str name="hl.bs.type">WORD</str>
                            <str name="hl.bs.language">en</str>
                            <str name="hl.bs.country">US</str>
                        </lst>
                    </boundaryScanner>
                </highlighting>
            </searchComponent>
            <queryResponseWriter name="json" class="solr.JSONResponseWriter">
                <str name="content-type">text/plain; charset=UTF-8</str>
            </queryResponseWriter>
            <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/>
            <!-- XSLT response writer transforms the XML output by any xslt file found
               in Solr's conf/xslt directory.  Changes to xslt files are checked for
               every xsltCacheLifetimeSeconds.  
            -->
            <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
                <int name="xsltCacheLifetimeSeconds">5</int>
            </queryResponseWriter>
            <!-- Query Parsers
           http://wiki.apache.org/solr/SolrQuerySyntax
               Multiple QParserPlugins can be registered by name, and then
               used in either the "defType" param for the QueryComponent (used
               by SearchHandler) or in LocalParams
            -->
            <!-- example of registering a query parser -->
            <!--
             <queryParser name="myparser" class="com.mycompany.MyQParserPlugin"/>
            -->
            <admin>
                <defaultQuery>*:*</defaultQuery>
                <!-- configure a healthcheck file for servers behind a
                   loadbalancer 
                -->
                <!--
                 <healthcheck type="file">server-enabled</healthcheck>
                -->
            </admin>
            <searchComponent name="suggest_full" class="solr.SpellCheckComponent">
                <str name="queryAnalyzerFieldType">suggestTextFull</str>
                <lst name="spellchecker">
                    <str name="name">suggest_full</str>
                    <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
                    <str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
                    <str name="field">text_suggest_full</str>
                    <str name="fieldType">suggestTextFull</str>
                </lst>
            </searchComponent>
            <requestHandler name="/suggest_full" class="org.apache.solr.handler.component.SearchHandler">
                <lst name="defaults">
                    <str name="echoParams">explicit</str>
                    <str name="spellcheck">true</str>
                    <str name="spellcheck.dictionary">suggest_full</str>
                    <str name="spellcheck.count">10</str>
                    <str name="spellcheck.onlyMorePopular">true</str>
                </lst>
                <arr name="last-components">
                    <str>suggest_full</str>
                </arr>
            </requestHandler>
            <requestHandler name="edismax" class="solr.SearchHandler" default="true">
                <lst name="defaults">
                    <str name="defType">edismax</str>
                    <str name="echoParams">explicit</str>
                    <float name="tie">0.1</float>
                    <str name="fl">keywords</str>
                    <str name="mm">1</str>
                    <str name="qf">kw_stopped^1.0 kw_phrases^5.0</str>
                    <str name="pf">kw_phrases^50.0</str>
                    <int name="ps">3</int>
                    <int name="qs">3</int>
                    <str name="q.alt">*:*</str>
                </lst>
            </requestHandler>
            <highlighting>
                <!-- Configure the standard fragmenter -->
                <!-- This could most likely be commented out in the "default" case -->
                <fragmenter name="gap" class="org.apache.solr.highlight.GapFragmenter" default="true">
                    <lst name="defaults">
                        <int name="hl.fragsize">100</int>
                    </lst>
                </fragmenter>
                <!-- A regular-expression-based fragmenter (f.i., for sentence extraction) -->
                <fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
                    <lst name="defaults">
                        <!-- slightly smaller fragsizes work better because of slop -->
                        <int name="hl.fragsize">70</int>
                        <!-- allow 50% slop on fragment sizes -->
                        <float name="hl.regex.slop">0.5</float>
                        <!-- a basic sentence pattern -->
                        <str name="hl.regex.pattern">[-\w ,/\n\"']{20,200}</str>
                    </lst>
                </fragmenter>
                <!-- Configure the standard formatter -->
                <formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
                    <lst name="defaults">
                        <str name="hl.simple.pre"><![CDATA[<em>]]></str>
                        <str name="hl.simple.post"><![CDATA[</em>]]></str>
                    </lst>
                </formatter>
            </highlighting>
            <!-- multi-colored tag FragmentsBuilder -->
            <fragmentsBuilder name="colored" class="org.apache.solr.highlight.ScoreOrderFragmentsBuilder">
                <lst name="defaults">
                    <str name="hl.tag.pre"><![CDATA[
             <b style="background:yellow">,<b style="background:lawgreen">,
             <b style="background:aquamarine">,<b style="background:magenta">,
             <b style="background:palegreen">,<b style="background:coral">,
             <b style="background:wheat">,<b style="background:khaki">,
             <b style="background:lime">,<b style="background:deepskyblue">]]></str>
                    <str name="hl.tag.post"><![CDATA[</b>]]></str>
                </lst>
            </fragmentsBuilder>
            <boundaryScanner name="breakIterator" class="solr.highlight.BreakIteratorBoundaryScanner">
                <lst name="defaults">
                    <str name="hl.bs.type">WORD</str>
                    <str name="hl.bs.language">en</str>
                    <str name="hl.bs.country">US</str>
                </lst>
            </boundaryScanner>
            <!-- Simple boundary scanner, marked as the default. -->
            <boundaryScanner name="simple" class="solr.highlight.SimpleBoundaryScanner" default="true">
                <lst name="defaults">
                    <str name="hl.bs.maxScan">10</str>
                    <!-- XML has no backslash escapes: a literal "\t\n" would put the
                         characters '\', 't' and 'n' into the boundary set. Tab and
                         newline are written as numeric character references instead. -->
                    <str name="hl.bs.chars">.,!?&#9;&#10;</str>
                </lst>
            </boundaryScanner>
</config>

1 个答案:

答案 0 :(得分:0)

Solr 的高亮(Highlighting)功能可以高亮显示搜索词,并返回匹配位置前后的文本片段。

您可以参阅 Solr 高亮功能(Highlighting)的文档,并按照其中的 Highlighting Parameters(高亮参数)进行配置。例如,在查询 URL 中加上 &hl=true&hl.fl=text&hl.fragsize=200;注意,被高亮的字段必须是已存储(stored)的字段。