我有下面这个 pandas Series,需要只保留那些是其他行超集的行(也就是去掉被其他行完全包含的行)。
系列:
1 [72197, 82086]
2 [72197, 82086, 194665]
3 [72230]
4 [72235, 72690, 121261]
5 [72235, 121261]
6 [72241]
7 [72251]
8 [72253, 83613]
9 [72253, 83613, 101294]
10 [72255, 122794]
11 [71962, 101646, 101663, 126351]
12 [71962, 101646, 101663, 126351, 141883]
13 [71962, 101646, 101663, 141883]
14 [72235]
期望输出的 Series:
1 [72197, 82086, 194665]
2 [72230]
3 [72235, 72690, 121261]
4 [72241]
5 [72251]
6 [72253, 83613, 101294]
7 [72255, 122794]
8 [71962, 101646, 101663, 126351, 141883]
答案 0 :(得分:1)
如果列表内元素的顺序不重要,可以使用下面这个稍作修改的解决方案:先把内部列表转换为 set,最后再转换回 list:
# Sample data: every row of the Series holds one list of integer ids.
# Note that [72253, 83613] appears twice.
s = pd.Series(
    [
        [72197, 82086],
        [72197, 82086, 194665],
        [72230],
        [72235, 72690, 121261],
        [72235, 121261],
        [72241],
        [72251],
        [72253, 83613],
        [72253, 83613],
        [72253, 83613, 101294],
        [72255, 122794],
        [71962, 101646, 101663, 126351],
        [71962, 101646, 101663, 126351, 141883],
        [71962, 101646, 101663, 141883],
        [72235],
    ]
)
-
import functools,operator,collections
def is_power_of_two(n):
    """Return True iff `n` is a positive power of two.

    `n & (n - 1)` clears the lowest set bit of `n`, so it is zero only when
    at most one bit was set. The `n > 0` guard rejects zero and negatives.
    """
    return n > 0 and (n & (n - 1)) == 0


def eliminate_subsets(sequence_of_sets):
    """Return the elements of `sequence_of_sets` that are not subsets of
    another element, each converted to a list.

    Assumes that each element is a set or frozenset and that no element is
    repeated: two equal sets are subsets of each other, so both would be
    dropped. When the input has two or more elements, empty sets are
    dropped as well. Surviving elements keep their input order; the order
    of items inside each returned list is the set's iteration order.
    """
    # `collections.Sequence` was removed in Python 3.10; the ABC lives in
    # `collections.abc`.
    from collections.abc import Sequence

    # The bitmap logic below does not handle a sequence containing only the
    # empty set, so handle all trivial inputs up front. Elements are still
    # converted to lists so the return shape matches the general case.
    if len(sequence_of_sets) <= 1:
        return [list(s) for s in sequence_of_sets]

    # Work on a sequence so that position i identifies the same set in both
    # passes below (bit i of a bitmap stands for the i-th set).
    if not isinstance(sequence_of_sets, Sequence):
        sequence_of_sets = list(sequence_of_sets)

    # For each item, build a bitmap of all sets that contain it.
    sets_containing_element = {}
    for i, s in enumerate(sequence_of_sets):
        bit = 1 << i
        for element in s:
            sets_containing_element[element] = (
                sets_containing_element.get(element, 0) | bit
            )

    # AND-ing the bitmaps of a set's items yields the sets that contain all
    # of them. If that is exactly one set (the set itself), no other element
    # contains it and it is kept; otherwise it is a subset and is dropped.
    out = [
        s
        for s in sequence_of_sets
        if s and is_power_of_two(functools.reduce(
            operator.and_, (sets_containing_element[x] for x in s)))
    ]
    return [list(s) for s in out]
# Turn every row into a set, keep only the rows that eliminate_subsets
# retains, and wrap the result back into a Series before printing it.
s = pd.Series(eliminate_subsets([set(row) for row in s]))
print(s)
0 [194665, 72197, 82086]
1 [72230]
2 [72690, 72235, 121261]
3 [72241]
4 [72251]
5 [101294, 72253, 83613]
6 [122794, 72255]
7 [101646, 126351, 71962, 141883, 101663]
dtype: object
答案 1 :(得分:1)
您可以尝试以下方法:
# Sample data: a single column 'ser' whose cells are lists of integer ids.
# Note that [72253, 83613] appears twice.
df = pd.DataFrame(
    {
        'ser': [
            [72197, 82086],
            [72197, 82086, 194665],
            [72230],
            [72235, 72690, 121261],
            [72235, 121261],
            [72241],
            [72251],
            [72253, 83613],
            [72253, 83613],
            [72253, 83613, 101294],
            [72255, 122794],
            [71962, 101646, 101663, 126351],
            [71962, 101646, 101663, 126351, 141883],
            [71962, 101646, 101663, 141883],
            [72235],
        ]
    }
)
df
ser
0 [72197, 82086]
1 [72197, 82086, 194665]
2 [72230]
3 [72235, 72690, 121261]
4 [72235, 121261]
5 [72241]
6 [72251]
7 [72253, 83613]
8 [72253, 83613]
9 [72253, 83613, 101294]
10 [72255, 122794]
11 [71962, 101646, 101663, 126351]
12 [71962, 101646, 101663, 126351, 141883]
13 [71962, 101646, 101663, 141883]
14 [72235]
# Collect every row of df['ser'] that, viewed as a set, contains at least one
# *other* row of the column. Rows with equal contents count as containing
# each other, and a row that contains no other row is left out.
row_sets = [set(row) for row in df['ser']]
supersets = [
    x
    for i, x in enumerate(df['ser'])
    if any(row_sets[i] >= other for j, other in enumerate(row_sets) if j != i)
]
print(supersets)
[[72197, 82086, 194665], [72235, 72690, 121261], [72235, 121261], [72253, 83613], [72253, 83613], [72253, 83613, 101294], [71962, 101646, 101663, 126351, 141883]]
在 Series 或 DataFrame 中存放列表的效率并不高。
答案 2 :(得分:0)
如果您使用的是普通列表,我想您可以使用 set(a):
m=[[72197, 82086] ,[72197, 82086, 194665],[72230],[72235, 72690, 121261],[72235, 121261],[72241],[72251],[72253, 83613],[72253, 83613, 101294],[72255, 122794]]
# Indices of the lists in m that are a proper subset of some other list in m.
# Every position is examined (including the last one) and each index is
# recorded at most once, however many supersets it has. Lists with equal
# contents are not proper subsets of each other, so duplicates are kept.
del_list_idx = [
    i
    for i, candidate in enumerate(m)
    if any(set(candidate) < set(other) for other in m)
]

# Delete from the back so the remaining indices stay valid.
for i in reversed(del_list_idx):
    del m[i]

for i, kept in enumerate(m):
    print(i, kept)