I have external Hive tables stored as Parquet, and data is added to them frequently. A separate application queries these Hive tables every 30 minutes and runs some validations. In Spark 1.6 I used HiveContext for the queries and it worked well. In Spark 2 I noticed that HiveContext is replaced by SparkSession (and its SQLContext), and when I query through it, the incremental changes are not visible.
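For comparison, the Spark 1.6 job queried the table roughly like this (only a sketch of the relevant part, run in spark-shell where sc is predefined):

// Spark 1.6: query the external Hive table through HiveContext.
import org.apache.spark.sql.hive.HiveContext
val hiveContext = new HiveContext(sc)
hiveContext.sql("select * from table6").show(false)

Below is a minimal reproduction of the Spark 2 behaviour.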
-----------------------
EXTERNAL HIVE TABLE ON PARQUET
-----------------------
create external table table6 (
  id       string,
  db1field string,
  created  string,
  empid    string
)
stored as parquet
location '/table6';
-----------------------
FIRST spark2-shell SESSION
-----------------------
// Write a new batch of rows as Parquet files under the table's location,
// refresh the table in this session, and query it.
import spark.implicits._
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types._

val rowsRdd: RDD[Row] = sc.parallelize(
  Seq(
    Row("firstrow", "1", "2018-10-12", "12232"),
    Row("secondrow", "2", "2018-10-12", "23244"),
    Row("thirdrow", "3", "2018-10-12", "54434")
  )
)

val schema = new StructType()
  .add(StructField("id", StringType, true))
  .add(StructField("db1field", StringType, true))
  .add(StructField("created", StringType, true))
  .add(StructField("empid", StringType, true))

val df = spark.createDataFrame(rowsRdd, schema)
df.write.mode("overwrite").parquet("/table6")

// Refresh so this session's cached metadata picks up the new files.
spark.sql("REFRESH TABLE table6")
spark.sql("select * from table6").show(false)
-----------------------
SECOND spark2-shell SESSION
-----------------------
// Query the same table from a separate session, without REFRESH TABLE.
import org.apache.spark.sql.SparkSession

val sparksession = SparkSession.builder().enableHiveSupport().getOrCreate()
val df = sparksession.sql("select * from table6")
df.show(false)
val df2 = sparksession.sqlContext.sql("select * from table6")
df2.show(false)
The queries in the second spark2-shell session do not see the changes made in the first spark2-shell session unless REFRESH TABLE is called explicitly. The problem is that I have more than 40 tables, and every one of them would need to be refreshed. Is there a cleaner solution?
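To make the scale of the problem concrete, the brute-force per-table refresh I am trying to avoid would look roughly like the sketch below (run in spark2-shell; the database name "default" is only a placeholder):

// Refresh every non-temporary table in the database before each
// validation run. With 40+ tables this would have to run every 30 minutes.
val db = "default"
spark.catalog.listTables(db).collect()
  .filter(!_.isTemporary)
  .foreach(t => spark.catalog.refreshTable(s"$db.${t.name}"))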