带有XPathEntityProcessor的SOLR DataImportHandler

时间:2015-09-08 12:55:35

标签: xml solr dataimporthandler

当我使用 DIH 的 XPathEntityProcessor 索引资源 XML 文件时,应该得到 2 个 Solr 文档:

  1. ID 为 1000 的 LINK,带有 2 个标签:ABC 和 DEF
  2. ID 为 2000 的 LINK,带有 3 个标签:GHI、JKL 和 MNO
  3. Solr版本:4.10.2

    问题: 我无法正确索引数据。

    预期输出:

    {
        "id": "1000",
        "field_name": "val1",
        "ABC": "ABC_VALUE",
        "DEF": "DEF_VALUE"
    },
    {
        "id": "2000",
        "field_name": "val2",
        "GHI": "GHI_VALUE",
        "JKL": "JKL_VALUE",
        "MNO": "MNO_VALUE"
    }
    

    =============================================== ==========================

    资源XML:

    <RESOURCE>
        <!-- Sample feed: each LINK has an ID attribute, one FIELD child, and
             repeated TAG children, each holding a TAG_CODE / TAG_VALUE pair. -->
        <!-- LINK 1000: two TAG entries (ABC, DEF). -->
        <LINK ID="1000">
            <FIELD>val1</FIELD>
            <TAG>
                <TAG_CODE>ABC</TAG_CODE>
                <TAG_VALUE>ABC_VALUE</TAG_VALUE>
            </TAG>
            <TAG>
                <TAG_CODE>DEF</TAG_CODE>
                <TAG_VALUE>DEF_VALUE</TAG_VALUE>
            </TAG>
        </LINK>
        <!-- LINK 2000: three TAG entries (GHI, JKL, MNO). -->
        <LINK ID="2000">
            <FIELD>val2</FIELD>
            <TAG>
                <TAG_CODE>GHI</TAG_CODE>
                <TAG_VALUE>GHI_VALUE</TAG_VALUE>
            </TAG>
            <TAG>
                <TAG_CODE>JKL</TAG_CODE>
                <TAG_VALUE>JKL_VALUE</TAG_VALUE>
            </TAG>
            <TAG>
                <TAG_CODE>MNO</TAG_CODE>
                <TAG_VALUE>MNO_VALUE</TAG_VALUE>
            </TAG>
        </LINK> 
    </RESOURCE>
    

    =============================================== ======================

    DataConfig XML(TRY 1):

    <dataConfig>
        <!-- Script transformer f1: reads the TAG_CODE and TAG_VALUE columns of a row,
             stores the value under a column named after the code, removes the two
             source columns, and returns the row. -->
        <script><![CDATA[
            function f1(row) {
                var code = row.get("TAG_CODE");
                var val = row.get("TAG_VALUE");
    
                row.put(code, val);
                row.remove("TAG_CODE");
                row.remove("TAG_VALUE");
                return row;
            }
        ]]></script>
        <dataSource type="URLDataSource" />
        <document>
            <!-- Outer entity: iterates over /RESOURCE/LINK; maps the ID attribute to
                 "id" and the FIELD child to "field_name". -->
            <entity name="testdata" url="http://host:port/uri"
                    processor="XPathEntityProcessor" forEach="/RESOURCE/LINK">
                <field column="id" xpath="/RESOURCE/LINK/@ID" />    
                <field column="field_name" xpath="/RESOURCE/LINK/FIELD" />
                <!-- Inner entity: reads the same url and iterates over
                     /RESOURCE/LINK/TAG, passing each row through f1.
                     NOTE(review): this forEach does not reference the parent row, so
                     nothing here limits the TAG rows to the current LINK; that matches
                     the reported output, where both documents carry all five tags.
                     NOTE(review): this entity reuses the parent's name "testdata";
                     confirm whether duplicate entity names are intended. -->
                <entity name="testdata" url="http://host:port/uri"
                    processor="XPathEntityProcessor" forEach="/RESOURCE/LINK/TAG" transformer="script:f1">
                    <field column="TAG_CODE" xpath="/RESOURCE/LINK/TAG/TAG_CODE" />
                    <field column="TAG_VALUE" xpath="/RESOURCE/LINK/TAG/TAG_VALUE" />
                </entity>
            </entity>
        </document>
    </dataConfig>
    
    Output:
        {
            "id": "1000",
            "field_name": "val1",
            "ABC": "ABC_VALUE",
            "DEF": "DEF_VALUE",
            "GHI": "GHI_VALUE",
            "JKL": "JKL_VALUE",
            "MNO": "MNO_VALUE"
        },
        {
            "id": "2000",
            "field_name": "val2",
            "ABC": "ABC_VALUE",
            "DEF": "DEF_VALUE",
            "GHI": "GHI_VALUE",
            "JKL": "JKL_VALUE",
            "MNO": "MNO_VALUE"
        }
    

    =============================================== =================

    DataConfig XML(TRY 2):

    <dataConfig>
        <!-- Script transformer f1: reads the TAG_CODE and TAG_VALUE columns of a row,
             stores the value under a column named after the code, removes the two
             source columns, and returns the row. -->
        <script><![CDATA[
            function f1(row) {
                var code = row.get("TAG_CODE");
                var val = row.get("TAG_VALUE");
    
                row.put(code, val);
                row.remove("TAG_CODE");
                row.remove("TAG_VALUE");
                return row;
            }
        ]]></script>
        <dataSource type="URLDataSource" />
        <document>
            <!-- Outer entity: iterates over /RESOURCE/LINK; maps the ID attribute to
                 "id" and the FIELD child to "field_name". -->
            <entity name="testdata" url="http://host:port/uri"
                    processor="XPathEntityProcessor" forEach="/RESOURCE/LINK">
                <field column="id" xpath="/RESOURCE/LINK/@ID" />    
                <field column="field_name" xpath="/RESOURCE/LINK/FIELD" />
                <!-- Inner entity: same url, but forEach now carries an attribute
                     predicate built from ${testdata.id}; the field xpaths below do
                     not carry that predicate.
                     NOTE(review): the predicate value is unquoted, and both entities
                     are named "testdata", so it is unclear which row ${testdata.id}
                     resolves against. TODO confirm.
                     NOTE(review): XPathEntityProcessor is documented as supporting
                     only a subset of XPath; verify that an attribute predicate is
                     accepted in forEach. The reported output has no tag columns. -->
                <entity name="testdata" url="http://host:port/uri"
                    processor="XPathEntityProcessor" forEach="/RESOURCE/LINK[@ID=${testdata.id}]/TAG" transformer="script:f1">
                    <field column="TAG_CODE" xpath="/RESOURCE/LINK/TAG/TAG_CODE" />
                    <field column="TAG_VALUE" xpath="/RESOURCE/LINK/TAG/TAG_VALUE" />
                </entity>
            </entity>
        </document>
    </dataConfig>
    
    Output:
        {
            "id": "1000",
            "field_name": "val1"        
        },
        {
            "id": "2000",
            "field_name": "val2"        
        }
    

    =============================================== =================

    DataConfig XML(TRY 3):

    <dataConfig>
        <!-- Script transformer f1: reads the TAG_CODE and TAG_VALUE columns of a row,
             stores the value under a column named after the code, removes the two
             source columns, and returns the row. -->
        <script><![CDATA[
            function f1(row) {
                var code = row.get("TAG_CODE");
                var val = row.get("TAG_VALUE");
    
                row.put(code, val);
                row.remove("TAG_CODE");
                row.remove("TAG_VALUE");
                return row;
            }
        ]]></script>
        <dataSource type="URLDataSource" />
        <document>
            <!-- Outer entity: iterates over /RESOURCE/LINK; maps the ID attribute to
                 "id" and the FIELD child to "field_name". -->
            <entity name="testdata" url="http://host:port/uri"
                    processor="XPathEntityProcessor" forEach="/RESOURCE/LINK">
                <field column="id" xpath="/RESOURCE/LINK/@ID" />    
                <field column="field_name" xpath="/RESOURCE/LINK/FIELD" />
                <!-- Inner entity: same url; here both forEach and the two field
                     xpaths carry the attribute predicate built from ${testdata.id}.
                     NOTE(review): the predicate value is unquoted, and both entities
                     are named "testdata", so it is unclear which row ${testdata.id}
                     resolves against. TODO confirm.
                     NOTE(review): XPathEntityProcessor is documented as supporting
                     only a subset of XPath; verify that an attribute predicate is
                     accepted in forEach. The reported output has no tag columns. -->
                <entity name="testdata" url="http://host:port/uri"
                    processor="XPathEntityProcessor" forEach="/RESOURCE/LINK[@ID=${testdata.id}]/TAG" transformer="script:f1">
                    <field column="TAG_CODE" xpath="/RESOURCE/LINK[@ID=${testdata.id}]/TAG/TAG_CODE" />
                    <field column="TAG_VALUE" xpath="/RESOURCE/LINK[@ID=${testdata.id}]/TAG/TAG_VALUE" />
                </entity>
            </entity>
        </document>
    </dataConfig>
    
    Output:
        {
            "id": "1000",
            "field_name": "val1"        
        },
        {
            "id": "2000",
            "field_name": "val2"        
        }
    

0 个答案:

没有答案