我正在尝试将CSV文件加载到solr doc,我正在尝试使用scala。我是斯卡拉的新手。对于案例类结构,如果我传递一组值,它可以正常工作。但是,如果我想要从CSV中获取所有读取值,则会出错。我不知道如何在scala中做到这一点,任何帮助都非常感激。
object BasicParseCsv {
case class Person(id: String, name: String,age: String, addr: String )
val schema = ArrayBuffer[Person]()
def main(args: Array[String]) {
val master = args(0)
val inputFile = args(1)
val outputFile = args(2)
val sc = new SparkContext(master, "BasicParseCsv", System.getenv("SPARK_HOME"))
val params = new ModifiableSolrParams
val Solr = new HttpSolrServer("http://localhost:8983/solr/person1")
//Preparing the Solr document
val doc = new SolrInputDocument()
val input = sc.textFile(inputFile)
val result = input.map{ line =>
val reader = new CSVReader(new StringReader(line));
reader.readNext();
}
def getSolrDocument(person: Person): SolrInputDocument = {
val document = new SolrInputDocument()
document.addField("id",person.id)
document.addField("name", person.name)
document.addField("age",person.age)
document.addField("addr", person.addr)
document
}
def send(persons:List[Person]){
persons.foreach(person=>Solr.add(getSolrDocument(person)))
Solr.commit()
}
val people = result.map(x => Person(x(0), x(1),x(2),x(3)))
val book1 = new Person("101","xxx","20","abcd")
send(List(book1))
people.map(person => send(List(Person(person.id, person.name, person.age,person.addr))))
System.out.println("Documents added")
}
}
people.map(person => send(List(person(person.id,person.name,person.age,person.addr))))==>给出错误
val book1 = new Person(" 101"," xxx"," 20"," abcd")==>工作正常
更新:我得到以下错误 线程" main"中的例外情况org.apache.spark.SparkException:任务不可序列化 在org.apache.spark.util.ClosureCleaner $ .ensureSerializable(ClosureCleaner.scala:304) 在org.apache.spark.util.ClosureCleaner $ .org $ apache $ spark $ util $ ClosureCleaner $$ clean(ClosureCleaner.scala:294) 在org.apache.spark.util.ClosureCleaner $ .clean(ClosureCleaner.scala:122) 在org.apache.spark.SparkContext.clean(SparkContext.scala:2067) 在org.apache.spark.rdd.RDD $$ anonfun $ map $ 1.apply(RDD.scala:324) 在org.apache.spark.rdd.RDD $$ anonfun $ map $ 1.apply(RDD.scala:323) 在org.apache.spark.rdd.RDDOperationScope $ .withScope(RDDOperationScope.scala:150) 在org.apache.spark.rdd.RDDOperationScope $ .withScope(RDDOperationScope.scala:111) 在org.apache.spark.rdd.RDD.withScope(RDD.scala:316) 在org.apache.spark.rdd.RDD.map(RDD.scala:323) 在BasicParseCsv $ .main(BasicParseCsv.scala:90) 在BasicParseCsv.main(BasicParseCsv.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) 在java.lang.reflect.Method.invoke(Method.java:497) 在com.intellij.rt.execution.application.AppMain.main(AppMain.java:144) 引起:java.io.NotSerializableException:org.apache.http.impl.client.SystemDefaultHttpClient 序列化堆栈: - 对象不可序列化(类:org.apache.http.impl.client.SystemDefaultHttpClient,值:org.apache.http.impl.client.SystemDefaultHttpClient@1dbd580) - field(类:org.apache.solr.client.solrj.impl.HttpSolrServer,name:httpClient,type:interface org.apache.http.client.HttpClient) - object(类org.apache.solr.client.solrj.impl.HttpSolrServer,org.apache.solr.client.solrj.impl.HttpSolrServer@17e0827) - field(类:BasicParseCsv $$ anonfun $ main $ 1,name:Solr $ 1,type:class org.apache.solr.client.solrj.impl.HttpSolrServer)