I am having two DataFrameReader
s with different options to prepare dataset, but I am not getting expected result.
DataFrameReader reader = getSparkSession().read().option("nullValue", DataAnalyzerUtils.NULL_VALUE).option("inferSchema", "true").option("mode", "DROPMALFORMED");
DataFrameReader customReader = getSparkSession().read().option("nullValue", DataAnalyzerUtils.BLANK).option("inferSchema", "true").option("mode", "DROPMALFORMED");
Dataset<Row> inputDataset = reader.csv(file.getName());
Dataset<Row> customDataset = customDataset = customReader.csv(file.getName());
final String table = "table" + System.currentTimeMillis();
final String customTable = "customtable" + System.currentTimeMillis();
inputDataset .createOrReplaceTempView(table );
customDataset.createOrReplaceTempView(customTable);
String blankQuery = DataAnalyzerUtils.getNonBlankCountColumn("", customTable , column.getName());
Dataset<Row> result = sparkSession.sql(blankQuery);
Row firstRow = result.first();
if (result.count() != 0 && firstRow.size() > 0) {
if (!firstRow.isNullAt(0)) {
System.out.println("Count:"+firstRow.size());
// Here I am getting always value as 0 for every column, why?
}
}
private static String getNonBlankCountColumn(String dbName, String tbName, String columnName) {
return "select count(" + columnName + ") from " + CommonUtils.getQualifiedTable(dbName, tbName)
+ " where length(" + columnName + ") != 0";
}