Checking for missing JSON columns with PySpark

Asked: 2018-03-30 16:24:49

Tags: python-2.7 pyspark

I am working with the customer schema below. My task is to process files, roughly 1000 per night, collect the data from all input files into dataframes (using caching), and finally write them into a Hive database. The problem is that in a few cases "InvoiceData" is missing from a file. I still need to process those files by adding the missing "InvoiceData" array filled with nulls, so I can review them later without breaking the process; for such files the dataframe only gives me CompanyID, StoreID, StartTime, EndTime and "StoreData".

I need to create two dataframes: one with CompanyID, StoreID, StartTime, EndTime and the "InvoiceData" array, and the other with CompanyID, StoreID, StartTime, EndTime and "StoreData".

Because the "InvoiceData" array is missing, the job fails while preparing the data. How should I handle this case?

The code is in Python 2.7.

''' Current Customer Schema '''
root
 |-- CompanyID: string (nullable = true)
 |-- StoreID: string (nullable = true)
 |-- EndTime: double (nullable = true)
 |-- InvoiceData: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- TimeStamp: double (nullable = true)
 |    |    |-- productID: double (nullable = true)
 |    |    |-- productName: double (nullable = true)
 |    |    |-- productSKU: double (nullable = true)
 |    |    |-- productUPC: double (nullable = true)
 |-- StoreData: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- TimeStamp: double (nullable = true)
 |    |    |-- x: double (nullable = true)
 |    |    |-- y: double (nullable = true)
 |    |    |-- z: double (nullable = true)
 |-- StartTime: double (nullable = true)
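
For reference, here is the same schema written out as an explicit StructType (transcribed from the printSchema output above, so the double types for the product fields are copied as printed); passing an explicit schema to spark.read.json is one way to end up with a null InvoiceData column instead of a missing one when the array is absent from a file:

from pyspark.sql.types import StructType, StructField, ArrayType, StringType, DoubleType

# Reconstructed from the printSchema() output above
invoice_element = StructType([
    StructField("TimeStamp", DoubleType()),
    StructField("productID", DoubleType()),
    StructField("productName", DoubleType()),
    StructField("productSKU", DoubleType()),
    StructField("productUPC", DoubleType())])

store_element = StructType([
    StructField("TimeStamp", DoubleType()),
    StructField("x", DoubleType()),
    StructField("y", DoubleType()),
    StructField("z", DoubleType())])

customer_schema = StructType([
    StructField("CompanyID", StringType()),
    StructField("StoreID", StringType()),
    StructField("StartTime", DoubleType()),
    StructField("EndTime", DoubleType()),
    StructField("InvoiceData", ArrayType(invoice_element)),
    StructField("StoreData", ArrayType(store_element))])

# could then be used like:
# customer_datafile = spark.read.json(json_file, schema=customer_schema)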

Here is my current PySpark code:

#!/bin/python2
import os
import subprocess
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
import sys

spark = SparkSession \
        .builder \
        .enableHiveSupport() \
        .config("hive.exec.dynamic.partition", "true") \
        .config("hive.exec.dynamic.partition.mode", "nonstrict") \
        .getOrCreate()

sc = spark.sparkContext

def fn_customer(customer_data):
    df_customer = customer_data.select(
          col("CompanyID").alias("company_id")
        , col("StoreID").alias("Store_id")
        , col("StartTime").alias("start_time")
        , col("EndTime").alias("end_time")
        , explode(col("InvoiceData")).alias("InvoiceData_ROW")
        )
    df_customer.show(1)
    print ("Unexpected Error df_customer : ", sys.exc_info()[0])

    print("Create df_customer :")
    df_customer_invoice = df_customer.select(
          "company_id", "Store_id", "start_time", "end_time"
        , col("InvoiceData_ROW.TimeStamp").alias("time_stamp")
        , col("InvoiceData_ROW.productID").alias("product_id")
        , col("InvoiceData_ROW.productName").alias("product_name")
        , col("InvoiceData_ROW.productSKU").alias("product_SKU")
        , col("InvoiceData_ROW.productUPC").alias("product_UPC")
        , from_unixtime("start_time", "yyyy").alias("year")
        , from_unixtime("start_time", "MM").alias("month")
        , from_unixtime("start_time", "dd").alias("day"))

    print ("Unexpected Error customer_data : ", sys.exc_info()[0])
    # df_customer_invoice.show(1)
    return df_customer_invoice

def fn_store(customer_data):
    ### Process the store data here
    return df_customer_store

def fn_one_entry(df_customer_invoice):
    df_customer_invoice.createOrReplaceTempView("vw_customer")
    spark.table("vw_customer").write.mode("append").insertInto("default.customer_invoice_table")
    print ("Unexpected Error during append QUA : ", sys.exc_info()[0])

def main():
    df_customer_invoice = dict()
    df_customer_store = dict()

    ''' Read file by file from source_location (about 1000 files) '''

    cmd = 'hdfs dfs -find {} -name *.json'.format('source_location').split()
    files = subprocess.check_output(cmd).strip().split('\n')

    for path in files:
        filename = path.split(os.path.sep)[-1].split('.json')[0]
        json_file = path
        ''' I believe there is something wrong with this part as well: it loops through and processes
        all the files, but stops as soon as either 'InvoiceData' or 'StoreData' is missing from a file ... '''

        customer_datafile = spark.read.json(json_file).withColumn('json_file', input_file_name())
        customer_data = customer_datafile.select('*'
                     , lit(filename).alias("filename")
                   )
        ''' when a file contains only 'StoreData' it fails; when it has both 'InvoiceData' and 'StoreData' there is no issue ... '''
        customerdata = customer_data.where(col('InvoiceData').isNotNull())
        df_customer_invoice = fn_customer(customerdata)
        df_customer_invoice = df_customer_invoice.cache()

        storedata = customer_data.where(col('StoreData').isNotNull())
        df_customer_store = fn_store(storedata)
        df_customer_store = df_customer_store.cache()

        invalid = customer_data.where(col('InvoiceData').isNull() & col('StoreData').isNull())
        print("Invalid Customer data file : ")

    fn_one_entry(df_customer_invoice)

if __name__ == '__main__':
  try:
      main()
  except(KeyboardInterrupt, EOFError):
      print("\nAborting ... Keyboard Interrupt.")
      sys.exit(1)

2 Answers:

Answer 0 (score: 0)

I see two possibilities for fixing your code:

1) To check whether a column exists in a dataframe, do not use if customer_data.select(col("InvoiceData")) (it throws an AnalysisException); instead use:

if 'InvoiceData' in customer_data.columns:
    do_something_with_customer()
elif 'StoreData' in customer_data.columns:
    do_something_with_store()
else:
    print("invalid file")
    continue
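
If you also need to keep rows from files where 'InvoiceData' is absent (the question mentions adding the missing array with nulls), a possible sketch is to add it as a typed null column in the missing-column branch; the element struct below is copied from the question's printed schema, so treat the exact types as an assumption:

from pyspark.sql.functions import lit
from pyspark.sql.types import ArrayType, StructType, StructField, DoubleType

# element struct copied from the question's printSchema output (types assumed)
invoice_type = ArrayType(StructType([
    StructField("TimeStamp", DoubleType()),
    StructField("productID", DoubleType()),
    StructField("productName", DoubleType()),
    StructField("productSKU", DoubleType()),
    StructField("productUPC", DoubleType())]))

if 'InvoiceData' not in customer_data.columns:
    # add the missing array as a typed null so later selects do not fail
    customer_data = customer_data.withColumn('InvoiceData', lit(None).cast(invoice_type))

Note that explode() drops rows where the array is null or empty; if those rows should survive for later review, explode_outer (available from Spark 2.2) keeps them with null invoice fields.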

2) It looks like you are processing the files one by one, but their schema can vary (since some files have no InvoiceData at all). This is not an efficient way to process many files, because each one launches a separate Spark job. I suggest reading all the files at once (the resulting schema is then complete) and distinguishing the records by the nullability of the fields, for example:

cmd = 'hdfs dfs -find {} -name *.json'.format('source_location').split()
files = subprocess.check_output(cmd).strip().split('\n')
df = spark.read.json(files).withColumn('filename', input_file_name())

customers = df.where(col('InvoiceData').isNotNull())
stores = df.where(col('StoreData').isNotNull())
invalid = df.where(col('InvoiceData').isNull() & col('StoreData').isNull())
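
From there the flattening can also happen once for the whole batch rather than per file; a rough sketch, reusing the column aliases from fn_customer and the table name from the question:

customers_flat = customers.select(
      col('CompanyID').alias('company_id')
    , col('StoreID').alias('store_id')
    , col('StartTime').alias('start_time')
    , col('EndTime').alias('end_time')
    , explode(col('InvoiceData')).alias('invoice')
    , 'filename')

df_customer_invoice = customers_flat.select(
      'company_id', 'store_id', 'start_time', 'end_time', 'filename'
    , col('invoice.TimeStamp').alias('time_stamp')
    , col('invoice.productID').alias('product_id')
    , col('invoice.productName').alias('product_name')
    , col('invoice.productSKU').alias('product_SKU')
    , col('invoice.productUPC').alias('product_UPC')
    , from_unixtime('start_time', 'yyyy').alias('year')
    , from_unixtime('start_time', 'MM').alias('month')
    , from_unixtime('start_time', 'dd').alias('day'))

# the column order has to match the Hive table before insertInto, so adjust as needed
# df_customer_invoice.write.mode('append').insertInto('default.customer_invoice_table')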

Answer 1 (score: 0)

from pyspark.sql.utils import AnalysisException

def hasColumn(df, col):
    try:
        df[col]
        return True
    except AnalysisException:
        return False

def main():
    df_customer_invoice = dict()
    df_customer_store = dict()

    ''' Read file by file from source_location (about 1000 files) '''
    cmd = 'hdfs dfs -find {} -name *.json'.format('source_location').split()
    files = subprocess.check_output(cmd).strip().split('\n')

    for path in files:
        filename = path.split(os.path.sep)[-1].split('.json')[0]
        json_file = path

        customer_datafile = spark.read.json(json_file).withColumn('json_file', input_file_name())
        customer_data = customer_datafile.select('*'
                     , lit(filename).alias("filename")
                   )
        if hasColumn(customer_data, "InvoiceData"):
            if customer_data.select(explode(col('InvoiceData'))).count() > 0:
                df_invoice = fn_customer(customer_data)
                df_customer_invoice = df_customer_invoice.union(df_invoice)

        if hasColumn(customer_data, "StoreData"):
            if customer_data.select(explode(col('StoreData'))).count() > 0:
                df_store = fn_store(customer_data)
                df_customer_store = df_customer_store.union(df_store)

    fn_one_entry(df_customer_invoice)
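
One caveat with the loop above: df_customer_invoice starts out as a dict(), so the first .union() call would fail. A small adjustment (a sketch, keeping the rest of the loop as written) is to start from None and union only from the second matching file onwards:

df_customer_invoice = None

for path in files:
    # ... read the file and build customer_data exactly as above ...
    if hasColumn(customer_data, "InvoiceData"):
        df_invoice = fn_customer(customer_data)
        if df_customer_invoice is None:
            df_customer_invoice = df_invoice
        else:
            df_customer_invoice = df_customer_invoice.union(df_invoice)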