My Spark SQL query API behaves unexpectedly. For example:
I have the following dataset:
+---+-------------------------------+--------+
|key|value |someData|
+---+-------------------------------+--------+
|1 |AAA |5 |
|1 |VVV |6 |
|1 |DDDD |8 |
|3 |rrerw |9 |
|4 |RRRRR |13 |
|6 |AAAAABB |15 |
|6 |C:\Windows\System32\svchost.exe|20 |
+---+-------------------------------+--------+
where the value column is of type String.
When I run the following command, the last row is filtered out, as expected:
data.filter(col("value").notEqual("C:\\Windows\\System32\\svchost.exe")).show();
But the following commands fail to filter out the last row:
data.createTempView("temp");
spark.sql("select * from temp where value != 'C:\\Windows\\System32\\svchost.exe'").show();
Filtering by any other value, e.g. "AAA", works. Any suggestions?
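For illustration, printing the query string shows what the SQL parser actually receives (the query variable below is hypothetical, just for demonstration):

// javac collapses each "\\" into a single backslash before Spark ever sees the string.
String query = "select * from temp where value != 'C:\\Windows\\System32\\svchost.exe'";
System.out.println(query);
// prints: select * from temp where value != 'C:\Windows\System32\svchost.exe'
// Spark SQL's default literal parser then consumes those single backslashes as
// escape characters, so the literal becomes 'C:WindowsSystem32svchost.exe'; it
// matches no row, value != ... is true everywhere, and nothing is filtered out.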
Edit: full code:
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.monotonically_increasing_id;

public class main {

    private static StructType schema = new StructType(new StructField[]{
            DataTypes.createStructField("key", DataTypes.IntegerType, true),
            DataTypes.createStructField("value", DataTypes.StringType, true),
            DataTypes.createStructField("someData", DataTypes.IntegerType, true)
    });

    public static class Dummy implements Serializable {
        int key;
        String value;
        int someData;

        public Dummy() {
            this.key = 1;
            this.value = "";
            this.someData = 0;
        }

        public Dummy(int key, String value, int someData) {
            this.key = key;
            this.value = value;
            this.someData = someData;
        }

        public int getKey() { return key; }
        public void setKey(int key) { this.key = key; }
        public String getValue() { return value; }
        public void setValue(String value) { this.value = value; }
        public int getSomeData() { return someData; }
        public void setSomeData(int someData) { this.someData = someData; }
    }

    public static void main(String[] args) throws AnalysisException {
        String master = "local[*]";
        SparkConf sparkConfiguration = new SparkConf().setAppName("Test!").setMaster(master);
        SparkSession spark = SparkSession.builder().config(sparkConfiguration).getOrCreate();
        spark.sparkContext().setLogLevel("ERROR");

        List<Dummy> intArray = Arrays.asList(
                new Dummy(1, "AAA", 5),
                new Dummy(1, "VVV", 6),
                new Dummy(1, "DDDD", 8),
                new Dummy(3, "rrerw", 9),
                new Dummy(4, "RRRRR", 13),
                new Dummy(6, "AAAAABB", 15),
                new Dummy(6, "C:\\Windows\\System32\\svchost.exe", 20));
        Dataset<Dummy> data = spark.createDataset(intArray, Encoders.bean(Dummy.class));

        // Map empty string values to null; note that String equality must use
        // isEmpty()/equals(), not ==, which compares references.
        Dataset<Row> rawMapping = data.map((MapFunction<Dummy, Row>) row -> RowFactory.create(
                row.getKey(),
                row.getValue().isEmpty() ? null : row.getValue(),
                row.getSomeData()
        ), RowEncoder.apply(schema));

        System.out.println("rawMapping");
        rawMapping.withColumn("id", monotonically_increasing_id()).show(false);

        // DataFrame API: the svchost.exe row is filtered out, as expected.
        rawMapping.filter(col("value").notEqual("C:\\Windows\\System32\\svchost.exe")).show();

        rawMapping.createTempView("temp");
        // SQL API: filtering 'VVV' works, but the svchost.exe row is not filtered out.
        spark.sql("select * from temp where value != 'VVV'").show();
        spark.sql("select * from temp where value != 'C:\\Windows\\System32\\svchost.exe'").show();
    }
}
Answer 0 (score: 0)
Try """ (a Scala triple-quoted raw string, in which the compiler does not collapse \\ into \):
spark.sql("""select * from temp where value != 'C:\\Windows\\System32\\svchost.exe'""")
  .show(false)
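In the raw string, Spark SQL's literal parser still receives two backslashes and unescapes them to one, so the comparison matches. A rough Java equivalent, as a sketch (assuming the spark session and temp view from the question; the escapedStringLiterals setting is an assumption, available in Spark 2.2.1+):

// Each backslash must survive two unescaping passes (javac, then Spark SQL's
// literal parser), so it appears four times in the Java source.
spark.sql("select * from temp where value != 'C:\\\\Windows\\\\System32\\\\svchost.exe'").show(false);

// Alternatively, tell the parser to take backslashes in SQL string literals verbatim.
spark.conf().set("spark.sql.parser.escapedStringLiterals", "true");
spark.sql("select * from temp where value != 'C:\\Windows\\System32\\svchost.exe'").show(false);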