我想获取 Hive 数据库中每张表的记录数。下面是我目前的解决方案，它的效率高吗？还有更好的方法吗？
from pyspark.sql import functions as F
# Count the rows of every table in one Hive database and print the result,
# largest table first. Relies on an existing SparkSession bound to `spark`
# and on `pyspark.sql.functions` imported as `F`.

# Name of the database whose tables are counted.
ds = 'dbname'

# Union of one-row DataFrames (table, num_records); stays None until at
# least one table has been resolved successfully.
full_df = None

for row in spark.sql("show tables in %s" % ds).collect():
    table_name = row.tableName
    try:
        # Resolve the table only once. count('*') does not need an explicit
        # column projection, so no intermediate select on the first column.
        py_df = spark.table("%s.%s" % (ds, table_name)).select(
            F.lit(table_name).alias('table'),
            F.count('*').alias('num_records'),
        )
    except Exception as e:
        # The try is scoped to a single table so that one table that cannot
        # be resolved is reported and skipped instead of aborting the loop.
        print("%s.%s: %s" % (ds, table_name, e))
        continue

    # Nothing is executed here: the plans are only chained, and the counts
    # are computed when show() is called below.
    full_df = py_df if full_df is None else full_df.union(py_df)

if full_df is None:
    # Empty database, or every table failed to resolve: nothing to show.
    print("no tables could be counted in %s" % ds)
else:
    full_df.orderBy(F.col('num_records').desc()).show(truncate=False)
对这段代码有什么改进建议吗？