(2月14日编辑)
假设我有一个带有以下模式的Spark(PySpark)数据框:
root
|-- myarray: array (nullable = true)
| |-- element: string (containsNull = true)
|-- myindices: array (nullable = true)
| |-- element: integer (containsNull = true)
看起来像:
+--------------------+----------+
| myarray | myindices|
+--------------------+----------+
| [A]| [0] |
| [B, C]| [1] |
| [D, E, F, G]| [0,2] |
+--------------------+----------+
如何使用第二个数组来索引第一个?
我的目标是创建一个新的数据框,如下所示:
+--------------------+----------+------+
| myarray | myindices|result|
+--------------------+----------+------+
| [A]| [0] | [A] |
| [B, C]| [1] | [C] |
| [D, E, F, G]| [0,2] | [D,F]|
+--------------------+----------+------+
(可以安全地假设 `myindices` 中的每个索引始终落在同一行 `myarray` 的长度范围之内,因此不存在越界问题。)
`.getItem()` 方法似乎只接受单个参数,所以我可能需要一个UDF,但我不知道如何创建一个以多个列作为输入的UDF。无论是否使用UDF,有什么解决方案吗?
答案 0 :(得分:3)
//todo error checks/cleanup
HRESULT hr;
ICLRMetaHost *pMetaHost = NULL;
ICLRRuntimeInfo *pRuntimeInfo = NULL;
ICorRuntimeHost *pCorRuntimeHost = NULL;
IUnknownPtr spAppDomainThunk = NULL;
_AppDomainPtr spDefaultAppDomain = NULL;
bstr_t bstrAssemblyName(L"");
_AssemblyPtr spAssembly = NULL;
bstr_t bstrClassName(L"");
_TypePtr spType = NULL;
variant_t vtEmpty;
bstr_t bstrStaticMethodName(L"Main");
variant_t vtLengthRet;
hr = CLRCreateInstance(CLSID_CLRMetaHost, IID_PPV_ARGS(&pMetaHost));
const wchar_t* pszVersion = L"v2.0.50727";
hr = pMetaHost->GetRuntime(pszVersion, IID_PPV_ARGS(&pRuntimeInfo));
BOOL fLoadable;
hr = pRuntimeInfo->IsLoadable(&fLoadable);
if (!fLoadable) { wprintf(L".NET runtime %s cannot be loaded\n", pszVersion); return; }
hr = pRuntimeInfo->GetInterface(CLSID_CorRuntimeHost, IID_PPV_ARGS(&pCorRuntimeHost));
hr = pCorRuntimeHost->Start();
hr = pCorRuntimeHost->GetDefaultDomain(&spAppDomainThunk);
hr = spAppDomainThunk->QueryInterface(IID_PPV_ARGS(&spDefaultAppDomain));
SAFEARRAYBOUND bounds[1];
bounds[0].cElements = array_len;
bounds[0].lLbound = 0;
SAFEARRAY* arr = SafeArrayCreate(VT_UI1, 1, bounds);
SafeArrayLock(arr);
memcpy(arr->pvData, bytearray, array_len);
SafeArrayUnlock(arr);
hr = spDefaultAppDomain->Load_3(arr, &spAssembly);
hr = spAssembly->GetType_2(bstrClassName, &spType);
hr = spType->InvokeMember_3(bstrStaticMethodName, static_cast<BindingFlags>(BindingFlags_InvokeMethod | BindingFlags_Static | BindingFlags_Public), NULL, vtEmpty, nullptr, &vtLengthRet);
SafeArrayDestroy(arr);