代码

def ensure_table(spark, tablename, path):
    """Make sure ``tablename`` can be queried, registering it from ``path`` if needed.

    Known table names are cached in the module-level ``SPARK_TABLES``
    collection. When the cache is empty it is filled from
    ``spark.catalog.listTables()`` with entries of the form
    ``"<database>.<name>"``. If ``tablename`` is not in the cache, the
    parquet data at ``path`` is read and registered as a temporary view
    under ``tablename``, and the name is added to the cache so later calls
    do not read and register the same data again.

    NOTE: catalog entries are cached as ``"<database>.<name>"``, so
    ``tablename`` only matches a catalog entry when it is passed in that
    same form.

    Args:
        spark: Session object providing ``catalog.listTables()`` and
            ``read.parquet()`` (presumably a ``SparkSession`` - confirm).
        tablename: Name to look up and, if missing, to register the view as.
        path: Location of the parquet data used when registering.

    Returns:
        The ``spark`` object that was passed in.
    """
    global SPARK_TABLES
    if not SPARK_TABLES:
        SPARK_TABLES = [
            f'{table.database}.{table.name}'
            for table in spark.catalog.listTables()
        ]

    if tablename not in SPARK_TABLES:
        frame = spark.read.parquet(path)
        # registerTempTable is deprecated since Spark 2.0;
        # createOrReplaceTempView is its direct replacement.
        frame.createOrReplaceTempView(tablename)
        # Remember the registration so the next call skips the parquet read.
        # Rebinding (rather than .append) works for any iterable cache type.
        SPARK_TABLES = [*SPARK_TABLES, tablename]
    return spark

def do_something(spark):
    """Register the products data and run the guide-price query.

    Ensures the ``products`` table is available (registering it from its
    HDFS parquet location when missing) and then runs an aggregation that
    returns the maximum ``price_guide`` per ``product_id`` / ``store_id``.

    NOTE(review): the view is registered as ``products`` while the query
    refers to ``main.products`` and ``main.stores``; confirm that the
    qualified names resolve in the target environment.

    Args:
        spark: Session object used to register the table and run the SQL.
    """
    # Arguments are separated by a comma (the original used ':' here,
    # which is a syntax error).
    ensure_table(spark, 'products', 'hdfs:///data/ods/main/products/')

    # max() alongside plain columns requires a GROUP BY on those columns,
    # and the string literal must be closed before the next statement.
    sql = '''select product_id, store_id, max(price_guide) price_guide
    from main.products p
    left join main.stores s on s.city_zip = p.area_zip
    where p.is_deleted=0 and p.is_enabled=1
    group by product_id, store_id
    '''

    df = spark.sql(sql)

    # ...

如上所示，我无法注册名为 main.products 的表（会报错），而不带数据库名的表名 products 在生产环境中并不会被使用。

PS：有许多表都需要做同样的处理。

那么，有什么好主意吗？

如何在使用 Spark SQL 时忽略数据库名称（或者注册带数据库名称的表）？

代码

0 个答案: