我们可以调用 JavaSparkContext.wholeTextFiles 并获取 JavaPairRDD<String, String>,其中第一个 String 是文件名,第二个 String 是整个文件内容。在 Dataset API 中是否有类似的方法?还是说我只能先将文件加载到 JavaPairRDD,然后再转换为 Dataset(这样做可行,但我正在寻找不使用 RDD 的解决方案)?
答案 0 :(得分:2)
如果您想使用数据集API,则可以使用 spark.read.text("path/to/files/")。
//C# Code
//Structure 1
/// <summary>
/// Status record exchanged with the native DLL through P/Invoke; it is the element type of the
/// second array argument of <c>CDM_iGetCassette</c>.
/// </summary>
/// <remarks>
/// The layout is sequential, so the order of the field declarations below defines the unmanaged
/// layout and must not be changed. Character data is marshalled as single-byte ANSI characters:
/// <see cref="CharSet.Ansi"/> is what the C# compiler applies by default and is only spelled out
/// here. The struct contains no pointers, so it does not need the <c>unsafe</c> modifier.
/// </remarks>
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
public struct tDevReturn
{
    // Element count of each inline character buffer (constants are not part of the marshalled layout).
    private const int BufferLength = 128;

    /// <summary>
    /// Creates a record with all numeric fields set to zero and both character buffers allocated.
    /// </summary>
    /// <param name="param">Ignored; its value is never read.</param>
    /// <remarks>
    /// Allocating an array (<c>new tDevReturn[n]</c>) does not run this constructor: the elements
    /// are default values whose array fields are <c>null</c>.
    /// </remarks>
    public tDevReturn(int param)
    {
        iLogicCode = 0;
        iPhyCode = 0;
        iHandle = 0;
        iType = 0;
        acDevReturn = new char[BufferLength];
        acReserve = new char[BufferLength];
    }

    // NOTE(review): the meaning of the individual fields is not visible in this file; the names
    // suggest logical/physical codes, a handle and a type - confirm against the vendor header.
    public int iLogicCode;
    public int iPhyCode;
    public int iHandle;
    public int iType;

    // Inline (by-value) buffer of 128 single-byte characters.
    [MarshalAs(UnmanagedType.ByValArray, SizeConst = BufferLength)]
    public char[] acDevReturn;

    // Inline (by-value) buffer of 128 single-byte characters.
    [MarshalAs(UnmanagedType.ByValArray, SizeConst = BufferLength)]
    public char[] acReserve;
}
//Structure 2
/// <summary>
/// Cash-box record exchanged with the native DLL through P/Invoke; it is the element type of the
/// first array argument of <c>CDM_iGetCassette</c>.
/// </summary>
/// <remarks>
/// The layout is sequential, so the order of the field declarations below defines the unmanaged
/// layout and must not be changed. Character data (<c>char</c> fields and <c>char[]</c> buffers)
/// is marshalled as single-byte ANSI characters: <see cref="CharSet.Ansi"/> is what the C#
/// compiler applies by default and is only spelled out here. The struct contains no pointers,
/// so it does not need the <c>unsafe</c> modifier.
/// </remarks>
[StructLayout(LayoutKind.Sequential, CharSet = CharSet.Ansi)]
public struct tCashBox
{
    // Element counts of the inline character buffers (constants are not part of the marshalled layout).
    private const int CurrencyLength = 4;
    private const int BoxIdLength = 6;
    private const int ReservedLength = 10;

    /// <summary>
    /// Creates a record with all scalar fields set to zero and all character buffers allocated.
    /// </summary>
    /// <param name="param">Ignored; its value is never read.</param>
    /// <remarks>
    /// Allocating an array (<c>new tCashBox[n]</c>) does not run this constructor: the elements
    /// are default values whose array fields are <c>null</c>.
    /// </remarks>
    public tCashBox(int param)
    {
        acCurrency = new char[CurrencyLength];
        acBoxID = new char[BoxIdLength];
        acReserved1 = new char[ReservedLength];
        acReserved2 = new char[ReservedLength];

        lDenomination = 0;
        iRemainCount = 0;
        iCount = 0;
        iOutCount = 0;
        iRejectCount = 0;
        iPurgeCount = 0;
        byHopper = 0;
        cStatus = '\0';
        cLastStatus = '\0';
        byBoxType = 0;
        iReserverd1 = 0;
        iReserverd2 = 0;
    }

    // Inline (by-value) buffer of 4 single-byte characters.
    [MarshalAs(UnmanagedType.ByValArray, SizeConst = CurrencyLength)]
    public char[] acCurrency;

    // NOTE(review): C# long is always 64-bit. If the native field is a C `long` built for Windows
    // (32-bit there), this field is too wide and every following field is read at the wrong
    // offset, which would match the "random values" symptom described at the call site.
    // Confirm against the vendor header and change to int if so.
    public long lDenomination;

    public int iRemainCount;
    public int iCount;
    public int iOutCount;
    public int iRejectCount;
    public int iPurgeCount;
    public byte byHopper;

    // Marshalled as one byte each under CharSet.Ansi.
    public char cStatus;
    public char cLastStatus;

    // Inline (by-value) buffer of 6 single-byte characters.
    [MarshalAs(UnmanagedType.ByValArray, SizeConst = BoxIdLength)]
    public char[] acBoxID;

    public byte byBoxType;

    // Inline (by-value) buffers of 10 single-byte characters each.
    [MarshalAs(UnmanagedType.ByValArray, SizeConst = ReservedLength)]
    public char[] acReserved1;
    [MarshalAs(UnmanagedType.ByValArray, SizeConst = ReservedLength)]
    public char[] acReserved2;

    public int iReserverd1;
    public int iReserverd2;
}
//API Import Declaration
// P/Invoke binding for the CDM_iGetCassette export of the native DLL, called with the StdCall convention.
// Both parameters are arrays of structures marked [In, Out]: their contents are marshalled to the
// native side before the call and copied back into the managed arrays afterwards.
// NOTE(review): the required array length and the meaning of the int return value are not visible
// in this file (the sample caller allocates 8 elements for each array) - confirm against the vendor documentation.
[DllImport("xxxxxxxxxx.dll", CallingConvention = CallingConvention.StdCall)]
public static extern int CDM_iGetCassette([In, Out] tCashBox[] p_psCashBox, [In, Out] tDevReturn[] p_psStatus);
// API call. Reported problem: after the call the structure arrays contain random values
// and the data appears in the wrong sequence.
// Both arrays are allocated with the same number of default-initialised elements.
const int elementCount = 8;
var cashboxData = new tCashBox[elementCount];
var response = new tDevReturn[elementCount];
int ret = Wrapper.CDM_iGetCassette(cashboxData, response);
请查看here以获取API详细信息。请注意,spark.read.text("path/to/files/") 中的 text() 方法会返回DataFrame,其中“文本文件中的每一行都是生成的DataFrame中的新行”。因此 text() 方法将提供文件内容。要获取文件名,您必须使用 input_file_name() 函数。
import static org.apache.spark.sql.functions.input_file_name;
Dataset<Row> ds = spark.read().text("c:\\temp").withColumnRenamed("value", "content").withColumn("fileName", input_file_name());
ds.show(false);
如果要连接同一文件中的行,使其与整个文件内容类似,则需要在 fileName 列上使用 groupBy 函数,并配合使用 concat_ws 和 collect_list 函数。