Suppose I have a very big data.frame/collection with the fields id and proxy id.
1 a
2 a
1 b
3 b
1 c
3 c
4 d
Now I'd like to count, for each id, how many proxies it shares with every other id.
1 2 1 #id1 id2 count
1 3 2
Ok, with some python itertools.combinations and lookups this works. But it feels cumbersome. Is there a more appropriate, simple, fast approach/technology?
My approach later appended:
I filtered out the ids which appear > x times, because I have millions of them.
def matchings(id):
    """Return ``[[id, other_id, shared_proxy_count], ...]`` for every other
    id that shares at least one proxy with *id*.

    NOTE(review): the parameter name shadows the builtin ``id``; it is kept
    for backward compatibility with existing callers.
    """
    # 'mapping' is the MongoDB collection (module-level).
    valid_proxies = [doc['proxy'] for doc in mapping.find({'id': id})]
    # All ids that appear under any of those proxies (includes *id* itself);
    # a generator avoids materializing a second large list.
    other_ids = (doc['id'] for doc in mapping.find({'proxy': {'$in': valid_proxies}}))
    # Count co-occurrences per (id, other_id) pair, skipping self-matches.
    c = Counter((id, id2) for id2 in other_ids if id2 != id)
    # possible filter:
    # c_filtered = {k: v for k, v in c.items() if v > 3}
    # some stats:
    # s1 = [id, len(valid_proxies), ...]
    return [[k[0], k[1], v] for k, v in c.items()]
# One result list per filtered id; chain.from_iterable flattens them lazily
# into a single table of [id1, id2, count] rows.
res = [matchings(one_id) for one_id in df_id_filtered['id']]
df_final_matching_counts = pd.DataFrame(list(itertools.chain.from_iterable(res)))
Thanks!
答案 0 :(得分:1)
我将假设输入数据如下所示:
[{id: 1, p: 'a'}, {id: 2, p: 'a'}, {id: 1, p: 'b'}, {id: 3, p: 'b'}, {id: 1, p: 'c'}, {id: 3, p: 'c'}, {id: 4, p: 'd'}]
这是一个 Cypher 查询(使用 neo4j 3.1 进行测试,该版本尚处于测试阶段),将返回预期的计数。此示例使用 WITH 子句内联指定输入数据,但在实践中,您将以其他方式获取数据(例如,通过 MATCH 或通过参数)。
// Inline sample data; in practice the data would come from a MATCH or a parameter.
WITH [{id: 1, p: 'a'}, {id: 2, p: 'a'}, {id: 1, p: 'b'}, {id: 3, p: 'b'}, {id: 1, p: 'c'}, {id: 3, p: 'c'}, {id: 4, p: 'd'}] AS data
// Emit one row per {id, p} element.
UNWIND data AS d
// Aggregate each id's proxy values into a list ps.
WITH d.id AS id, COLLECT(d.p) AS ps
// Gather all {id, ps} maps into a single list c so pairs can be enumerated by index.
WITH COLLECT({id: id, ps: ps}) AS c
WITH
// Outer REDUCE: iterate i over every entry except the last (first id of each pair).
REDUCE(s=[], i IN RANGE(0, SIZE(c)-2) |
s + {id1: c[i].id, x:
// Middle REDUCE: pair i only with later entries j, so each pair appears once.
REDUCE(t=[], j IN RANGE(i+1, SIZE(c)-1) |
t + {id2: c[j].id, ct:
// Inner REDUCE: count how many of i's proxies also appear in j's proxy list.
REDUCE(u=0, k IN RANGE(0, SIZE(c[i].ps)-1) |
CASE WHEN (c[i].ps)[k] IN c[j].ps THEN u + 1 ELSE u END
)
}
)
}
) AS temp
// Flatten the nested {id1, x: [{id2, ct}, ...]} structure back into rows.
UNWIND temp AS temp_row
UNWIND temp_row.x AS x
WITH temp_row.id1 AS id1, x.id2 AS id2, x.ct AS ct
// Keep only pairs that actually share at least one proxy.
WHERE ct > 0
RETURN id1, id2, ct;
查询逐步处理数据,并计算每个 id 对共享相同代理(p)值的次数。然后,只要计数大于 0,它就会返回该 id 对及其计数。
上述查询的结果是:
╒═══╤═══╤═══╕
│id1│id2│ct │
╞═══╪═══╪═══╡
│2 │1 │1 │
├───┼───┼───┤
│1 │3 │2 │
└───┴───┴───┘
答案 1 :(得分:1)
这是我的方法
# Sample data: each row links an id to a proxy value.
df = pd.DataFrame({'id': {0: 1, 1: 2, 2: 1, 3: 3, 4: 1, 5: 3, 6: 4},
                   'proxy': {0: 'a', 1: 'a', 2: 'b', 3: 'b', 4: 'c', 5: 'c', 6: 'd'}})
# print() works on both Python 2 and 3; the original `print df` is Python-2-only.
print(df)
按 id 分组,并把每个 id 的 proxy 汇总为一个集合(set):
# Group by id and collapse each id's proxy values into a set.
df_g = df.groupby('id')['proxy'].apply(set).to_frame()
df_g.columns = ['proxy']
制作笛卡儿积的技巧(参见 cartesian product in pandas):
# A constant dummy key on both sides makes the join a cartesian product.
df_g['X'] = 'X'
side = df_g.reset_index()
merged = side.merge(side, on='X')
给出:
id_x proxy_x X id_y proxy_y
0 1 {a, c, b} X 1 {a, c, b}
1 1 {a, c, b} X 2 {a}
2 1 {a, c, b} X 3 {c, b}
3 1 {a, c, b} X 4 {d}
4 2 {a} X 1 {a, c, b}
5 2 {a} X 2 {a}
6 2 {a} X 3 {c, b}
7 2 {a} X 4 {d}
8 3 {c, b} X 1 {a, c, b}
9 3 {c, b} X 2 {a}
10 3 {c, b} X 3 {c, b}
11 3 {c, b} X 4 {d}
12 4 {d} X 1 {a, c, b}
13 4 {d} X 2 {a}
14 4 {d} X 3 {c, b}
15 4 {d} X 4 {d}
一些整理
# we dont care about (1,1) (2,2), (3,3) etc.
# Self-pairs like (1,1), (2,2) carry no information — drop them.
cross_pairs = merged['id_x'] != merged['id_y']
merged_filtered = merged[cross_pairs].copy(deep=True)
# Shared-proxy count is simply the size of the set intersection.
merged_filtered['intersect'] = merged_filtered.apply(
    lambda row: len(row['proxy_x'] & row['proxy_y']), axis=1)
# (a, b) and (b, a) describe the same pair; a sorted key makes them equal
# so drop_duplicates can remove the mirror rows later.
merged_filtered['keys'] = merged_filtered.apply(
    lambda row: sorted((row['id_x'], row['id_y'])), axis=1)
merged_filtered = merged_filtered[['keys', 'intersect']]
还有其他方法可以将列表对象列拆分为两列
# Split the two-element key list into separate id columns.
merged_filtered['key1'] = merged_filtered['keys'].str[0]
merged_filtered['key2'] = merged_filtered['keys'].str[1]
merged_filtered.drop('keys', axis=1, inplace=True)
现在删除重复项
# Collapse the (a, b)/(b, a) mirror rows and index by the ordered pair.
merged_filtered = merged_filtered.drop_duplicates().set_index(['key1', 'key2'])
# print() works on both Python 2 and 3; the original `print x` is Python-2-only.
print(merged_filtered)
intersect
key1 key2
1 2 1
3 2
4 0
2 3 0
4 0
3 4 0
如果你想删除零:
# Show only pairs that share at least one proxy.
# print() works on both Python 2 and 3; the original `print x` is Python-2-only.
print(merged_filtered[merged_filtered['intersect'] != 0])
intersect
key1 key2
1 2 1
3 2