我需要索引一些XML文件。这些文件的结构与以下地址返回的内容相同:http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=24069587,1568,4069587,2598965,18000000,254875,587895&rettype=fasta&retmode=xml
我编辑了Solr配置文件:
data-config.xml 中:
<!--
  DataImportHandler configuration: reads a single local XML file and emits one
  Solr document per element matched by the entity's forEach expression.
-->
<dataConfig>
  <dataSource type="FileDataSource" encoding="UTF-8" />
  <document>
    <!--
      pk="pmc" lines up with <uniqueKey>pmc</uniqueKey> in schema.xml.
      NOTE(review): forEach and every xpath below assume the root element of
      the input file is <pmc-articleset>. The file is described as following
      the efetch db=pubmed response format; if its root is different (for
      example PubmedArticleSet/PubmedArticle), forEach matches nothing and the
      import reports 0 documents. Confirm against the actual file.
    -->
    <entity name="PubmedArticle"
            processor="XPathEntityProcessor"
            stream="true"
            pk="pmc"
            forEach="/pmc-articleset/article"
            url="C:\Users\hp\Desktop\idpSOlr - Copy\apache-solr-4.0.0\example\solr\collection1\conf\pmcsampleDownloaded.xml"
            transformer="RegexTransformer,DateFormatTransformer">
      <field column="journal-title" xpath="/pmc-articleset/article/front/journal-meta/journal-title-group/journal-title"/>
      <!-- Identifier mappings: the element name must be "field"; the
           misspelling "filed" meant these mappings were not recognised. -->
      <field column="pmc" xpath="/pmc-articleset/article/front/article-meta/article-id[@pub-id-type='pmc']"/>
      <field column="pmid" xpath="/pmc-articleset/article/front/article-meta/article-id[@pub-id-type='pmid']"/>
      <field column="other" xpath="/pmc-articleset/article/front/article-meta/article-id[@pub-id-type='other']"/>
      <field column="doi" xpath="/pmc-articleset/article/front/article-meta/article-id[@pub-id-type='doi']"/>
      <!-- The article title is indexed into the schema's required "text" field;
           the path segment is "title-group" (previously misspelt "tittle-group"). -->
      <field column="text" xpath="/pmc-articleset/article/front/article-meta/title-group/article-title" />
    </entity>
  </document>
</dataConfig>
以及 schema.xml 中:
<!--
  Solr schema "pubmed": declares the field types and the fields populated by
  the data import, with "pmc" as the unique document key.
-->
<schema name="pubmed" version="1.1">
  <types>
    <!-- Primitive / non-analyzed types -->
    <fieldType name="nametext" class="solr.TextField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="integer" class="solr.IntField" omitNorms="true"/>
    <fieldType name="long" class="solr.LongField" omitNorms="true"/>
    <fieldType name="float" class="solr.FloatField" omitNorms="true"/>
    <fieldType name="double" class="solr.DoubleField" omitNorms="true"/>
    <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="random" class="solr.RandomSortField" indexed="true" />

    <!-- A text field that only splits on whitespace for exact matching of words -->
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>

    <!-- Whitespace-tokenized text with stop words, word-delimiter splitting,
         lower-casing and Porter stemming; synonyms are applied at query time only. -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Standard-tokenized text with stop words and lower-casing; synonyms are
         applied at query time only. -->
    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Keeps the whole value as one token, lower-cases and trims it, then
         removes every character that is not a-z. -->
    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.TrimFilterFactory" />
        <filter class="solr.PatternReplaceFilterFactory"
                pattern="([^a-z])" replacement="" replace="all"
        />
      </analyzer>
    </fieldType>

    <!-- Element name normalized to "fieldType" to match every other declaration. -->
    <fieldType name="ignored" stored="false" indexed="false" class="solr.StrField" />
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
  </types>

  <fields>
    <!-- "journal-title" and "text" are required="true": a document that lacks
         either value is rejected at index time. -->
    <field name="journal-title" type="text" indexed="true" stored="true" required="true" multiValued="false" />
    <field name="pmc" type="string" indexed="true" stored="true" required="false" multiValued="false" />
    <field name="pmid" type="string" indexed="true" stored="true" required="false" multiValued="false" />
    <field name="other" type="string" indexed="true" stored="true" required="false" multiValued="false" />
    <field name="doi" type="string" indexed="true" stored="true" required="false" multiValued="false" />
    <field name="article-title" type="text" indexed="true" stored="true" required="false" multiValued="true" />
    <!--<field name="surname" type="string" indexed="true" stored="true" required="true" multiValued="true" />
    <field name="given-name" type="string" indexed="true" stored="true" required="true" multiValued="true" />
    <field name="pub-date_day" type="long" indexed="true" stored="true" required="true" multiValued="true" />-->
    <field name="text" type="text_general" indexed="true" stored="true" required="true" multiValued="true"/>
    <field name="_version_" type="long" indexed="true" stored="true"/>
  </fields>

  <!-- "pmc" is the unique key, so every imported document needs a pmc value. -->
  <uniqueKey>pmc</uniqueKey>
  <solrQueryParser defaultOperator="OR"/>
</schema>
我已将请求处理程序配置为:
<!-- Exposes the DataImportHandler at /dataimport; its import definition is
     read from data-config.xml. -->
<requestHandler name="/dataimport" class="org.apache.solr.handler.dataimport.DataImportHandler">
  <lst name="defaults">
    <str name="config">data-config.xml</str>
  </lst>
</requestHandler>
当我向Solr发出完全导入(full-import)请求时(http://localhost:8983/solr/dataimport?command=full-import),
我第一次得到了这样的答复:
<response>
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">18</int>
</lst>
<lst name="initArgs">
<lst name="defaults">
<str name="config">data-config.xml</str>
</lst>
</lst>
<str name="command">full-import</str>
<str name="status">idle</str>
<str name="importResponse"/>
<lst name="statusMessages"/>
<str name="WARNING">
This response format is experimental. It is likely to change in the future.
</str>
</response>
第二次,我收到了这个回复:
<response>
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">6</int>
</lst>
<lst name="initArgs">
<lst name="defaults">
<str name="config">data-config.xml</str>
</lst>
</lst>
<str name="command">full-import</str>
<str name="status">idle</str>
<str name="importResponse"/>
<lst name="statusMessages">
<str name="Total Requests made to DataSource">0</str>
<str name="Total Rows Fetched">0</str>
<str name="Total Documents Skipped">0</str>
<str name="Full Dump Started">2013-10-12 20:45:12</str>
<str name="">
Indexing completed. Added/Updated: 0 documents. Deleted 0 documents.
</str>
<str name="Committed">2013-10-12 20:45:12</str>
<str name="Total Documents Processed">0</str>
<str name="Time taken">0:0:0.270</str>
</lst>
<str name="WARNING">
This response format is experimental. It is likely to change in the future.
</str>
</response>
当我查询Solr时,我发现没有任何文档被编入索引。有人可以帮我解决这个问题吗?我的 schema.xml 和 data-config.xml 配置是否正确?