Python / Pandas匹配另一个子字符串中的子字符串

时间:2020-11-11 10:46:16

标签: python pandas dataframe substring

我在查找存储于两个不同数据帧(DataFrame)的两个不同子集中的公共键时遇到了困难,并希望据此输出第三列:

// Pushes one input sample through the resampler MFT and reads back how many
// bytes it produced. Every HRESULT is routed through winrt::check_hresult.

// Instantiate the COM object identified by CLSID_CResamplerMediaObject and
// obtain its IMFTransform interface.
winrt::com_ptr<IMFTransform> Transform;
winrt::check_hresult(CoCreateInstance(CLSID_CResamplerMediaObject, nullptr, CLSCTX_ALL, IID_PPV_ARGS(Transform.put())));

// WAVEFORMATEX members in declaration order (per the Windows SDK documentation):
// wFormatTag, nChannels, nSamplesPerSec, nAvgBytesPerSec, nBlockAlign, wBitsPerSample.
// cbSize is not listed and is therefore zero-initialized.
// Input: PCM, 1 channel, 44100 Hz, 16-bit. Output: same layout at 48000 Hz.
WAVEFORMATEX InputWaveFormatEx { WAVE_FORMAT_PCM, 1, 44100, 44100 * 2, 2, 16 };
WAVEFORMATEX OutputWaveFormatEx { WAVE_FORMAT_PCM, 1, 48000, 48000 * 2, 2, 16 };

// Build one media type per direction from the wave formats above.
winrt::com_ptr<IMFMediaType> InputMediaType;
winrt::check_hresult(MFCreateMediaType(InputMediaType.put()));
winrt::check_hresult(MFInitMediaTypeFromWaveFormatEx(InputMediaType.get(), &InputWaveFormatEx, sizeof InputWaveFormatEx));
winrt::com_ptr<IMFMediaType> OutputMediaType;
winrt::check_hresult(MFCreateMediaType(OutputMediaType.put()));
winrt::check_hresult(MFInitMediaTypeFromWaveFormatEx(OutputMediaType.get(), &OutputWaveFormatEx, sizeof OutputWaveFormatEx));

// Configure stream 0 on both sides of the transform (input type first, then output type).
winrt::check_hresult(Transform->SetInputType(0, InputMediaType.get(), 0));
winrt::check_hresult(Transform->SetOutputType(0, OutputMediaType.get(), 0));

// Query the output stream flags and check that
// MFT_OUTPUT_STREAM_SINGLE_SAMPLE_PER_BUFFER is not set.
// NOTE(review): _A is not defined in this snippet; presumably an assertion macro - confirm.
MFT_OUTPUT_STREAM_INFO OutputStreamInfo { };
winrt::check_hresult(Transform->GetOutputStreamInfo(0, &OutputStreamInfo));
_A(!(OutputStreamInfo.dwFlags & MFT_OUTPUT_STREAM_SINGLE_SAMPLE_PER_BUFFER));

// Input buffer sized to nAvgBytesPerSec, i.e. one second of audio in the input format.
// The current length is set to the full size, but nothing in this snippet writes
// to the buffer's memory.
DWORD const InputMediaBufferSize = InputWaveFormatEx.nAvgBytesPerSec;
winrt::com_ptr<IMFMediaBuffer> InputMediaBuffer;
winrt::check_hresult(MFCreateMemoryBuffer(InputMediaBufferSize, InputMediaBuffer.put()));
winrt::check_hresult(InputMediaBuffer->SetCurrentLength(InputMediaBufferSize));
// Wrap the buffer in a sample and submit it on input stream 0.
winrt::com_ptr<IMFSample> InputSample;
winrt::check_hresult(MFCreateSample(InputSample.put()));
winrt::check_hresult(InputSample->AddBuffer(InputMediaBuffer.get()));
winrt::check_hresult(Transform->ProcessInput(0, InputSample.get(), 0));

// Output buffer with a capacity of one second of audio in the output format;
// its current length starts at zero.
DWORD const OutputMediaBufferCapacity = OutputWaveFormatEx.nAvgBytesPerSec;
winrt::com_ptr<IMFMediaBuffer> OutputMediaBuffer;
winrt::check_hresult(MFCreateMemoryBuffer(OutputMediaBufferCapacity, OutputMediaBuffer.put()));
winrt::check_hresult(OutputMediaBuffer->SetCurrentLength(0));
winrt::com_ptr<IMFSample> OutputSample;
winrt::check_hresult(MFCreateSample(OutputSample.put()));
winrt::check_hresult(OutputSample->AddBuffer(OutputMediaBuffer.get()));
// First member is the stream identifier (0), second is the caller-provided sample;
// the remaining members are zero-initialized by aggregate initialization.
MFT_OUTPUT_DATA_BUFFER OutputDataBuffer { 0, OutputSample.get() };
// NOTE(review): Status is passed to ProcessOutput as an out-parameter without
// being initialized first.
DWORD Status;
// Request output with a single output buffer.
winrt::check_hresult(Transform->ProcessOutput(0, 1, &OutputDataBuffer, &Status));

// Read the output buffer's current length after the ProcessOutput call.
DWORD OutputMediaBufferSize = 0;
winrt::check_hresult(OutputMediaBuffer->GetCurrentLength(&OutputMediaBufferSize));

预期输出:

enter image description here

我已经做过一些调研……下面这个问题确实与此相似:How to merge pandas on string contains?。但那里的键只包含一个项目,而我的示例中两个键都各包含 2 个项目。

1 个答案:

答案 0 :(得分:0)

假设您的各个代码(code)始终以空格分隔。

您可以使用列表推导式(list comprehension)来检查 Code1 列中的每个代码是否出现在 code2 列中。通过检索匹配代码的索引,我们可以得到一个只包含代码有重叠的行的 DataFrame。

然后,我们可以更新原始数据帧以获取预期的输出。

# Match rows of df1 to rows of df2 whenever their space-separated codes share
# at least one token, then copy the matching df2 columns onto df1.
#
# Expects df1 with a 'Code1' column and df2 with 'Second Name' and 'code2'
# columns to already exist.

# Build a boolean match matrix: one list per df1 row, one flag per df2 row.
# A flag is True when any token of Code1 also appears among the tokens of code2.
list_of_matches = df1['Code1'].apply(lambda x: [
                         any(word in str(list_of_words).split()
                             for word in str(x).split())
                         for list_of_words in df2['code2']])

# Positional (row, column) coordinates of every True cell in the matrix
i, j = np.where(list_of_matches.values.tolist())

# Pair each matching df1 row with its df2 row. np.where yields positions, so
# select with iloc, and keep df1's own index labels on the result: without them
# the frame gets a fresh 0..k-1 index and update() below writes the matches
# into the wrong rows.
# Rows containing NA are dropped, as they don't make sense.
df3 = pd.DataFrame(np.column_stack([df1.iloc[i], df2.iloc[j]]),
                   columns=df1.columns.append(df2.columns),
                   index=df1.index[i]).dropna()

# A df1 row may match several df2 rows; keep the first match only, because
# update() cannot align on duplicate index labels.
df3 = df3[~df3.index.duplicated(keep='first')]

# Create the target columns in the original dataframe so update() can fill them.
# Object dtype lets them hold strings next to the NaN placeholders.
df1['Second Name'] = pd.Series(np.nan, index=df1.index, dtype=object)
df1['code2'] = pd.Series(np.nan, index=df1.index, dtype=object)

# Update dataframe with matching rows (aligned on df1's index labels)
df1.update(df3)

输出

    Name    Code1   Second Name   code2
0   John    AAA OO  Cohen         AAA GGG
1   Michael BBB UU  Smith         UU HHH
2   Dan     JJ      NaN           NaN
3   George  NaN     NaN           NaN
4   Adam    II      Kas           TT II