我刚刚在spark中发现了类序列化的错误。
=>现在,我想进行单元测试,但我不知道怎么做?
注意:
答案 0 :(得分:1)
研究火花广播代码,我找到了一种方法。但它使用私有火花代码,因此如果火花内部发生变化,它可能会变得无效。但它仍然有效。
在以org.apache.spark
开头的包中添加测试类,例如:
package org.apache.spark.my_company_tests
// [imports]
/**
* test data that need to be broadcast in spark (using kryo)
*/
class BroadcastSerializationTests extends FlatSpec with Matchers {
it should "serialize a transient val, which should be lazy" in {
val data = new MyClass(42) // data to test
val conf = new SparkConf()
// Serialization
// code found in TorrentBroadcast.(un)blockifyObject that is used by TorrentBroadcastFactory
val blockSize = 4 * 1024 * 1024 // 4Mb
val out = new ChunkedByteBufferOutputStream(blockSize, ByteBuffer.allocate)
val ser = new KryoSerializer(conf).newInstance() // Here I test using KryoSerializer, you can use JavaSerializer too
val serOut = ser.serializeStream(out)
Utils.tryWithSafeFinally { serOut.writeObject(data) } { serOut.close() }
// Deserialization
val blocks = out.toChunkedByteBuffer.getChunks()
val in = new SequenceInputStream(blocks.iterator.map(new ByteBufferInputStream(_)).asJavaEnumeration)
val serIn = ser.deserializeStream(in)
val data2 = Utils.tryWithSafeFinally { serIn.readObject[MyClass]() } { serIn.close() }
// run test on data2
data2.yo shouldBe data.yo
}
}
class MyClass(i: Int) extends Serializable {
@transient val yo = 1 to i // add lazy to make the test pass: not lazy transient val are not recomputed after deserialization
}