将数据库列拆分为多值Solr字段

时间:2013-01-24 22:33:28

标签: solr dih

我正在努力弄清楚如何使数据导入处理程序的splitBy结构起作用。我期待它将输入列拆分为多值字段。这是一个重现问题的测试用例:

import java.io.File;
import java.io.IOException;
import java.sql.SQLException;

import static org.junit.Assert.*;

import javax.sql.DataSource;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.io.FileUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.core.CoreContainer;
import org.hsqldb.jdbc.JDBCDataSource;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class TestSplitBy {

  SolrServer server;
  File configPath = new File(FileUtils.getTempDirectory(), Long.toString(System.nanoTime()));

  String solrconfig_xml = "<config><luceneMatchVersion>LUCENE_41</luceneMatchVersion><requestHandler name=\"search\" class=\"solr.SearchHandler\" default=\"true\"><lst name=\"defaults\"><str name=\"fl\">*</str><str name=\"df\">id</str></lst></requestHandler><requestHandler name=\"/dataimport\" class=\"org.apache.solr.handler.dataimport.DataImportHandler\"><lst name=\"defaults\"><str name=\"config\">data-config.xml</str></lst></requestHandler></config>";

  String data_config_xml = "<dataConfig>" +
      "<dataSource url=\"jdbc:hsqldb:mem:testdb\" user=\"SA\" driver=\"org.hsqldb.jdbc.JDBCDriver\" />" +
      "<document>" +
      "<entity name=\"item\" transformer=\"RegexTransformer\" query=\"SELECT * FROM test\">" +
      "<field column=\"type\" name=\"type\" splitBy=\",\" />" +
      "</entity>" +
      "</document>" +
      "</dataConfig>";

  String schema_xml = "<schema version=\"1.3\" name=\"test\">" +
      "<types>" +
      "<fieldType name=\"string\" class=\"solr.StrField\" sortMissingLast=\"true\" omitNorms=\"true\" />" +
      "</types>" +
      "<fields>" +
      "<field stored=\"true\" name=\"id\" type=\"string\" />" +
      "<field stored=\"true\" name=\"type\" type=\"string\" multiValued=\"true\"/>" +
      "</fields>" +
      "<uniqueKey>id</uniqueKey>" +
      "</schema>";

  DataSource getDataSource() {
    JDBCDataSource ds = new JDBCDataSource();
    ds.setUser("SA");
    ds.setUrl("mem:testdb");
    return ds;
  }

  void populateDb(DataSource ds) {
    QueryRunner runner = new QueryRunner(ds);
    try {
      runner.update("DROP TABLE test IF EXISTS");
      runner.update("CREATE TABLE test(id INTEGER, type VARCHAR(256));");
      runner.update("INSERT INTO test VALUES 1, 'foo,bar,baz'");
    } catch (SQLException e) {
      System.err.println(e);
    }
  }

  void writeSolrConfig() throws IOException {
    File corePath = new File(configPath, "collection1");
    corePath.mkdir();
    File confPath = new File(corePath, "conf");
    confPath.mkdir();
    FileUtils.write(new File(confPath, "data-config.xml"), data_config_xml);
    FileUtils.write(new File(confPath, "schema.xml"), schema_xml);
    FileUtils.write(new File(confPath, "solrconfig.xml"), solrconfig_xml);
  }

  void startSolr() {
    System.setProperty("solr.solr.home", configPath.getAbsolutePath());
    CoreContainer.Initializer initializer = new CoreContainer.Initializer();
    CoreContainer coreContainer = initializer.initialize();
    server = new EmbeddedSolrServer(coreContainer, "collection1");
  }

  @Before
  public void setup() throws IOException {
    populateDb(getDataSource());
    writeSolrConfig();
    startSolr();
  }

  @After
  public void tearDown() {
    server.shutdown();
    FileUtils.deleteQuietly(configPath);
  }

  @Test
  public void testSplitBy() throws Exception {
    SolrQuery query = new SolrQuery();
    query.set("qt", "/dataimport");
    query.setParam("command", "full-import");
    QueryResponse response = server.query(query);
    Thread.sleep(500);

    response = server.query(new SolrQuery("*:*"));
    for (SolrDocument doc: response.getResults()) {
      assertNotNull(doc.getFieldValues("type"));
      assertEquals(3, doc.getFieldValues("type").size());
    }
  }
}

测试用例的POM:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>solr</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>Solr Sanity</name>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>2.3.2</version>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr</artifactId>
      <version>4.1.0</version>
      <type>war</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-dataimporthandler</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.apache.solr</groupId>
      <artifactId>solr-solrj</artifactId>
      <version>4.1.0</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>commons-dbutils</groupId>
      <artifactId>commons-dbutils</artifactId>
      <version>1.5</version>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.hsqldb</groupId>
      <artifactId>hsqldb</artifactId>
      <version>2.2.9</version>
      <type>jar</type>
      <scope>runtime</scope>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
    </dependency>
    <dependency>
      <groupId>javax.servlet</groupId>
      <artifactId>servlet-api</artifactId>
      <version>2.5</version>
    </dependency>
  </dependencies>
</project>

有关如何将这些类型正确拆分为多个值的任何见解?

1 个答案:

答案 0 :(得分:4)

原来这个单元测试存在一些问题:

  1. HSQL的列名区分大小写(默认为大写)。

  2. 如果Solr字段名称和db列名称相同,则还会添加具有整个db值的额外令牌。

  3. 字段定义应如下所示:

  4. <field column="solrField" splitBy="," sourceColName="TYPE" />
    

    通常情况下 - 使用RegexTransformer将DB中的单值字段与多值字段混合使用时:

    • 如果使用splitBy,则column属性是Solr字段的名称。 sourceColName是数据库列

    • 如果使用splitBy,则column属性是数据库列名称,name属性是Solr字段。