如何处理Dataset<Row>中非基本数据类型的列

时间:2019-03-26 12:50:41

标签: apache-spark apache-spark-sql

我有一个Dataset<Row>,其中有四列,其中两列是非原始数据类型List<Long>和List<String>

  +------+---------------+---------------------------------------------+---------------+
  |    Id| value         |     time                                    |aggregateType  |
  +------+---------------+---------------------------------------------+---------------+
  |0001  |  [1.5,3.4,4.5]| [1551502200000,1551502200000,1551502200000] | Sum           |
  +------+---------------+---------------------------------------------+---------------+

我有一个接受三个参数并返回Double的UDF3,类型为UDF3<String,List<Long>,List<String>,Double>。

所以当我调用UDF时,它会抛出一个异常

错误

Caused by: java.lang.ClassCastException: scala.collection.mutable.WrappedArray$ofRef cannot be cast to java.util.List

但是,如果我将参数类型都更改为String,即UDF3<String,String,String,Double>,就不会报错。

引发异常的代码

 // UDF3 declared with (String, List<Long>, List<String>) parameters and a Double result.
 // This is the variant the question reports as failing with a ClassCastException
 // (scala WrappedArray$ofRef cannot be cast to List) when it is invoked.
 // NOTE(review): the snippet is abbreviated -- `return double;` is a placeholder and the
 // anonymous class is never closed with `};`.
 UDF3<String,List<Long>,List<String>,Double> getAggregate = new UDF3<String,List<Long>,List<String>,Double>() {

 public Double call(String t1,List<Long> t2,List<String> t3) throws Exception {

 //do some process to return double

  return double;
  }

  // Register the UDF under the name used by callUDF below, with a Double return type.
  sparkSession.udf().register("getAggregate_UDF",getAggregate, DataTypes.DoubleType);

  // Arguments are passed in the order aggregateType, time, value, so the array columns
  // "time" and "value" are bound to the List-typed parameters t2 and t3.
  // NOTE(review): `inputDS` and `inputDs` differ only in case -- confirm both names exist.
  inputDS = inputDs.withColumn("value_new",callUDF("getAggregate_UDF",col("aggregateType"),col("time"),col("value")));

将所有类型更改为字符串后的代码

 // Workaround variant: all three UDF parameters are declared as String, and the array
 // columns are cast to String at the call site, so the lists have to be parsed by hand
 // inside call().
 // NOTE(review): the snippet is abbreviated -- `return double;` is a placeholder and the
 // anonymous class is never closed with `};`.
 UDF3<String,String,String,Double> getAggregate = new UDF3<String,String,String,Double>() {

 public Double call(String t1,String t2,String t3) throws Exception {

 //code to convert t2 and t3 to List<Long> and List<String> respectively

 //do some process to return double

  return double;
  }

  // Register the UDF under the name used by callUDF below, with a Double return type.
  sparkSession.udf().register("getAggregate_UDF",getAggregate, DataTypes.DoubleType);

  // "time" and "value" are cast to String before being handed to the UDF.
  inputDS = inputDs.withColumn("value_new",callUDF("getAggregate_UDF",col("aggregateType"),col("time").cast("String"),col("value").cast("String")));

上面的代码可以运行,但需要手动将String转换为List。

需要帮助

I)如何在数据集中处理非原始数据类型List<Long>和List<String>,以避免 Caused by: java.lang.ClassCastException: scala.collection.mutable.WrappedArray$ofRef cannot be cast to java.util.List

II)如果有其他解决方法,请给我一些建议

谢谢。

2 个答案:

答案 0 :(得分:3)

您的UDF将始终接收WrappedArray实例而不是List,因为这是引擎存储它们的方式。

您需要编写如下内容:

/// Fetches the signed-in user's orders from the Firestore `order` collection.
///
/// Queries for documents whose `account_id` equals the current user's uid and converts
/// each document with `extractOrder(_:)`; documents that cannot be converted are dropped.
///
/// - Parameter completion: Receives the converted orders, or `nil` when there is no
///   signed-in user, the query reports an error, or no snapshot is delivered.
public func getTransactionData(completion: @escaping (([Order]?) -> ())) {
    guard let userId = Auth.auth().currentUser?.uid else { completion(nil); return }
    let db = Firestore.firestore()
    let query = db.collection("order").whereField("account_id", isEqualTo: userId)
    query.getDocuments() { (querySnapshot, err) in
        if let err = err {
            print("Error getting documents: \(err)")
            // Report the failure to the caller; previously this branch only logged and
            // the completion handler was never invoked.
            completion(nil)
            return
        }
        guard let querySnapshot = querySnapshot else { completion(nil); return }
        // Keep only the documents that convert into a complete Order.
        let orders = querySnapshot.documents.compactMap { self.extractOrder($0) }
        completion(orders)
    }
}

/// Builds an `Order` from a Firestore order document.
///
/// The document ID becomes the order ID and the order type is always `"PICK UP"`.
/// Line items are converted with `extractLineItems(_:)`; a missing or differently typed
/// `line_items` value results in an order without line items.
///
/// - Parameter document: The query result document to convert.
/// - Returns: The converted order, or `nil` when any of the required fields is missing
///   or does not have the expected type.
private func extractOrder(_ document: QueryDocumentSnapshot) -> Order? {
    // Read the payload once instead of calling `document.data()` again for every field.
    let data = document.data()
    print("document.data() is \(data)")
    let lineItems = extractLineItems(data["line_items"] as? [[String: Any]] ?? [])
    let orderId = document.documentID
    guard let balanceId = data["balance_id"] as? String,
        let accountId = data["account_id"] as? String,
        let subtotal = data["subtotal"] as? Int,
        let date = data["date"] as? Int,
        let totalAmount = data["total_amount"] as? Int,
        let notes = data["notes"] as? String,
        let rewardAmount = data["reward_amount"] as? Int,
        let status = data["status"] as? String,
        let tax = data["tax_amount"] as? Int,
        let tip = data["tip_amount"] as? Int,
        let balanceAmount = data["balance_amount"] as? Int,
        let discountAmount = data["discount_amount"] as? Int,
        let locationId = data["location_id"] as? String
        else { return nil }
    let order = Order(
        totalAmount: totalAmount,
        subtotal: subtotal,
        discountAmount: discountAmount,
        tipAmount: tip,
        taxAmount: tax,
        balanceAmount: balanceAmount,
        rewardAmount: rewardAmount,
        balanceId: balanceId,
        accountId: accountId,
        locationId: locationId,
        date: date,
        status: status,
        orderType: "PICK UP",
        lineItems: lineItems,
        notes: notes,
        orderId: orderId
    )
    print("order is \(order)")
    return order
}

/// Overwritten by `extractLineItems(_:)` for every line item it converts: holds that
/// item's `modifiers` value, or `nil` when the entry has none of type `[Modifier]`.
var modifiers: [Modifier]?
/// Overwritten by `extractLineItems(_:)` for every line item it converts: holds that
/// item's `toppings` value, or an empty array when the entry has none of type `[String]`.
var toppings: [String]?

/// Converts raw line-item dictionaries into `MenuItem` values.
///
/// Entries that lack any required key, or hold a value of an unexpected type for one,
/// are skipped. For every converted entry the `modifiers` and `toppings` properties are
/// overwritten with that entry's values before the `MenuItem` is built from them.
///
/// - Parameter dictionaryArray: The raw line-item dictionaries.
/// - Returns: The converted items, in the order of their source entries.
private func extractLineItems(_ dictionaryArray: [[String:Any]]) -> [MenuItem] {
    var parsedItems = [MenuItem]()
    for entry in dictionaryArray {
        guard let itemId = entry["item_id"] as? String,
            let category = entry["category"] as? String,
            let name = entry["name"] as? String,
            let description = entry["description"] as? String,
            let photoUrl = entry["photoUrl"] as? String,
            let basePrice = entry["base_item_price"] as? Int,
            let unitPrice = entry["unit_price"] as? Int,
            let totalPrice = entry["total_price"] as? Int,
            let quantity = entry["quantity"] as? Int,
            let size = entry["size"] as? String,
            let modifierKeys = entry["modifierKeys"] as? [String],
            let sizeAddOnPrice = entry["sizeAddOnPrice"] as? Int,
            let toppingsAddOnPrice = entry["toppingsAddOnPrice"] as? Int
            else { continue }

        // Optional extras: modifiers fall back to nil, toppings to an empty array.
        self.modifiers = entry["modifiers"] as? [Modifier]
        self.toppings = (entry["toppings"] as? [String]) ?? [String]()

        let totalModPrice = sizeAddOnPrice + toppingsAddOnPrice
        let lineItem = MenuItem(
            itemId: itemId,
            name: name,
            modifiers: self.modifiers,
            photoUrl: photoUrl,
            quantity: quantity,
            basePrice: basePrice,
            unitPrice: unitPrice,
            totalPrice: totalPrice,
            totalModPrice: totalModPrice,
            sizeAddOnPrice: sizeAddOnPrice,
            toppingsAddOnPrice: toppingsAddOnPrice,
            description: description,
            size: size,
            toppings: self.toppings,
            category: category,
            modifierKeys: modifierKeys
        )
        parsedItems.append(lineItem)
    }
    return parsedItems
}

答案 1 :(得分:1)

这是我的示例,您必须使用WrappedArray接收数组并将其转换为列表

 /*
     +------+---------------+---------------------------------------------+---------------+
     |    Id| value         |     time                                      |aggregateType  |
     +------+---------------+---------------------------------------------+---------------+
     |0001  |  [1.5,3.4,4.5]| [1551502200000,1551502200000,1551502200000] | Sum             |
     +------+---------------+---------------------------------------------+---------------+
     **/

    // Schema mirroring the table above: a string id, two non-nullable array columns
    // (doubles and longs, elements non-null) and a string naming the aggregation.
    StructField[] fields = {
        createStructField("Id", DataTypes.StringType, true),
        createStructField("value", DataTypes.createArrayType(DataTypes.DoubleType, false), false),
        createStructField("time", DataTypes.createArrayType(DataTypes.LongType, false), false),
        createStructField("aggregateType", DataTypes.StringType, true)
    };
    StructType dataSchema = new StructType(fields);

    // One sample row matching the schema.
    Row sampleRow = RowFactory.create("0001",
                                      Arrays.asList(1.5, 3.4, 4.5),
                                      Arrays.asList(1551502200000L, 1551502200000L, 1551502200000L),
                                      "sum");
    List<Row> data = new ArrayList<>();
    data.add(sampleRow);

    Dataset<Row> example = spark.createDataFrame(data, dataSchema);
    example.show(false);

    // The array columns are declared as WrappedArray parameters and converted to
    // java.util.List inside the UDF body.
    UDF3<String, WrappedArray<Long>, WrappedArray<Double>, Double> myUDF = (aggregateType, times, values) -> {

        List<Long> timesAsList = JavaConversions.seqAsJavaList(times);
        List<Double> valuesAsList = JavaConversions.seqAsJavaList(values);

        // Example: only the "sum" aggregate is implemented; anything else yields 0.
        if (!"sum".equals(aggregateType)) {
            return 0d;
        }
        return valuesAsList.stream()
                           .mapToDouble(Double::doubleValue)
                           .sum();
    };

    spark.udf().register("myUDF", myUDF, DataTypes.DoubleType);

    // Argument order matches the UDF signature: aggregateType, time, value.
    Column aggregated = callUDF("myUDF", col("aggregateType"), col("time"), col("value"));
    example = example.withColumn("new", aggregated);
    example.show(false);

您可以从 GitHub 获取它。