计算不正确的结果。
+--------+
|count(1)|
+--------+
| 0| // Should not be zero
+--------+
这是相关代码:
val customSchema = StructType(Array(
StructField("_c0", StringType, true),
StructField("Birth_Country", StringType, true),
StructField("Email", StringType, true),
StructField("First_Name", StringType, true),
StructField("Job", StringType, true),
StructField("Last_name", StringType, true),
StructField("SSN", StringType, true),
StructField("Loan_Approved", StringType, true),
StructField("Income", DoubleType, true)))
StructField("bytes_served", DoubleType, true)))
val df = spark.read.format("csv").option("header", `"true")
//.load("examples/src/main/resources/Fake_data.csv")
.schema(customSchema)
.load("examples/src/main/resources/Fake_data.csv")
df.show()
// System.exit(1)
df.select("Birth_Country", "Email","First_Name","Job","Last_name","Loan_Approved","SSN").show()
df.createOrReplaceTempView("people")
val sqlDF2 = spark.sql("SELECT Birth_Country, count(Birth_Country) as bm FROM people group by Birth_Country");`
sqlDF2.createOrReplaceTempView("BC")
val ans_1 = spark.sql("select Birth_Country, bm as count from BC where bm = (select max(bm) from BC)")
val ans_2 = spark.sql("select mean(Income) from people where Birth_Country='United States of America'")
val ans_4=spark.sql("select count(*) from people where Income > 1000.00 ")
ans_1.show
ans_2.show
ans_4.show
数据就像......第一个字段为空
Birth_Country Email First_Name Income Job Last_name Loan_Approved SSN
----------
0 Bosnia and Herzegovina emily15@whitehead.com Melissa 109957.00 Telecommunications researcher Miranda FALSE 129-41-7773
1 Belgium ronald87@yahoo.com Curtis 301884.00 Animal nutritionist Garrett TRUE 212-74-3976
2 United Kingdom hannah29@gmail.com Connor 341584.00 English as a foreign language teacher Steele FALSE 024-35-3834
3 Kiribati derrick59@hotmail.com Adam 448293.00 Surveyor, commercial/residential Newman FALSE 157-82-4486