我有两个数据帧mapp
和data
,如下所示
mapp = pd.DataFrame({'variable': ['d22','Studyid','noofsons','Level','d21'],'concept_id':[1,2,3,4,5]})
data = pd.DataFrame({'sourcevalue': ['d22heartabcd','Studyid','noofsons','Level','d21abcdef']})
我想从data
获取一个值,并检查它是否存在于mapp
中,如果是,则获取相应的concept_id
值。优先级是首先寻找exact match
。如果没有找到匹配项,请继续substring match
。由于我要处理超过百万条记录,因此任何标量解决方案都是有帮助的
s = mapp.set_index('variable')['concept_id']
data['concept_id'] = data['sourcevalue'].map(s)
产生如下输出
当我进行子字符串匹配时,有效记录也会变为NA,如下所示
data['concept_id'] = data['sourcevalue'].str[:3].map(s)
我不知道为什么现在向NA
提供有效记录
如何以一种优雅而有效的方式一次完成这两项检查?
我希望我的输出如下所示
答案 0 :(得分:2)
如果需要按字符串映射,并且前3个字母创建两个单独的序列,然后使用Series.fillna
或Series.combine_first
用a
替换b
中的缺失值:
s = mapp.set_index('variable')['concept_id']
a = data['sourcevalue'].map(s)
b = data['sourcevalue'].str[:3].map(s)
data['concept_id'] = a.fillna(b)
#alternative
#data['concept_id'] = a.combine_first(b)
print (data)
sourcevalue concept_id
0 d22heartabcd 1.0
1 Studyid 2.0
2 noofsons 3.0
3 Level 4.0
4 d21abcdef 5.0
编辑:
#all strings map Series
s = mapp.set_index('variable')['concept_id']
print (s)
variable
d22 1
Studyid 2
noofsons 3
Level 4
d21 5
Name: concept_id, dtype: int64
#first 3 letters map Series
s1 = mapp.assign(variable = mapp['variable'].str[:3]).set_index('variable')['concept_id']
print (s1)
variable
d22 1
Stu 2
noo 3
Lev 4
d21 5
Name: concept_id, dtype: int64
#first 3 letters map by all strings
print (data['sourcevalue'].str[:3].map(s))
0 1.0
1 NaN
2 NaN
3 NaN
4 5.0
Name: sourcevalue, dtype: float64
#first 3 letters match by 3 first letters map Series
print (data['sourcevalue'].str[:3].map(s1))
0 1
1 2
2 3
3 4
4 5
Name: sourcevalue, dtype: int64
答案 1 :(得分:2)
使用我编写的fuzzy_merge
函数:
new = fuzzy_merge(data, mapp, 'sourcevalue', 'variable')\
.merge(mapp, left_on='matches', right_on='variable')\
.drop(columns=['matches', 'variable'])
输出
sourcevalue concept_id
0 d22heartabcd 1
1 Studyid 2
2 noofsons 3
3 Level 4
4 d21abcdef 5
链接答案中使用的功能:
def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
"""
df_1 is the left table to join
df_2 is the right table to join
key1 is the key column of the left table
key2 is the key column of the right table
threshold is how close the matches should be to return a match
limit is the amount of matches will get returned, these are sorted high to low
"""
s = df_2[key2].tolist()
m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
df_1['matches'] = m
m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
df_1['matches'] = m2
return df_1
答案 2 :(得分:2)
尝试一下。在这种情况下,我们将在第一个映射之后定位NA值并对其进行子字符串映射
//self.video!.folderURL.path = /var/mobile/Containers/Data/Application/066BDB03-FD47-48D8-B6F8-932AFB174DF7/Documents/AV/769c504203024bae95b47d78d8fe9029
try? fileManager.createDirectory(atPath: self.video!.folderURL.path, withIntermediateDirectories: true, attributes: nil)
// Update permission for AV folder
do {
let parentDirectoryPath = self.video!.folderURL.deletingLastPathComponent().path
let result = try fileManager.setAttributes([FileAttributeKey.posixPermissions: 0o777], ofItemAtPath: parentDirectoryPath)
print(result)
} catch {
print("Error = \(error)")
}
// sourceURL = file:///private/var/mobile/Containers/Data/PluginKitPlugin/50D8B8DB-19D3-4DD6-93DD-55F37CF87EA7/tmp/trim.6D37DFD2-5F27-4AB9-B478-5FED8AA6ABD7.MOV
// self.url = file:///var/mobile/Containers/Data/Application/066BDB03-FD47-48D8-B6F8-932AFB174DF7/Documents/AV/0b0b6803e891780850152eeab450a2ae.mov
do {
try fileManager.moveItem(at: sourceURL, to: self.url)
} catch {
QLogTools.logError("Error moving video clip file \(error)")
}