替换2D列表中的元素

时间:2015-12-30 18:44:39

标签: python

请看下面这个列表:

 # Sample table from the question: ten rows, each a list of four strings.
 # The literal string 'unknown' is the placeholder the question wants replaced.
 # NOTE(review): the columns look like age, job, marital status and a yes/no
 # flag, but the source never names them -- confirm before relying on that.
 my_data= [
     ['58', 'management', 'unknown', 'no'],
     ['44', 'technician', 'single', 'no'],
     ['33', 'entrepreneur', 'married', 'no'],
     ['47', 'blue-collar', 'married', 'no'],
     ['33', 'unknown', 'single', 'no'],
     ['35', 'management', 'unknown', 'no'],
     ['28', 'management', 'single', 'no'],
     ['42', 'entrepreneur', 'divorced', 'no'],
     ['58', 'retired', 'married', 'no'],
     ['43', 'technician', 'unknown', 'no']
]

我想用列表中最常见的元素替换这些 'unknown' 值。我的代码有问题,有人能帮忙纠正吗?看起来我需要调用 remove_unknowns() 两次才能让它工作。

def most_common(lst):
    """Return the value that occurs most often in *lst*.

    The values are counted in a single pass. When several values share the
    highest count, the one that appears first in *lst* is returned
    (``Counter.most_common`` keeps first-encountered order on Python 3.7+),
    so the result does not depend on ``set`` iteration order.

    Args:
        lst: an iterable of hashable values.

    Returns:
        The most frequent value.

    Raises:
        ValueError: if *lst* is empty.
    """
    from collections import Counter

    counts = Counter(lst)
    if not counts:
        # Same exception type that max() raises on an empty sequence.
        raise ValueError("most_common() arg is an empty sequence")
    return counts.most_common(1)[0][0]

def remove_unknowns(ls):
    """Replace every ``'unknown'`` cell of the table *ls* in place.

    Each column is handled independently: the most frequent string value of
    that column (not counting ``'unknown'`` itself) is written into every
    cell of the column that holds ``'unknown'``. One call is enough, and
    calling it again changes nothing.

    Every column present in the data is processed. Rows may have different
    lengths; a row that is too short for a column is skipped for that
    column. Non-string cells are neither counted nor replaced. A column
    that contains no real string value is left untouched. On a tie the
    value seen first (top to bottom) wins on Python 3.7+.

    Args:
        ls: a list of rows, each row being a mutable list of cells.

    Returns:
        The same list object *ls*, after modification.
    """
    from collections import Counter

    if not ls:
        return ls

    width = max(len(row) for row in ls)
    for col in range(width):
        # A fresh tally per column: values from other columns, and the
        # placeholder itself, must not influence the choice.
        tally = Counter(
            row[col]
            for row in ls
            if col < len(row)
            and isinstance(row[col], str)
            and row[col] != 'unknown'
        )
        if not tally:
            # Nothing but placeholders / non-strings here: no substitute exists.
            continue

        # Computed once per column instead of once per replaced cell.
        replacement = tally.most_common(1)[0][0]
        for row in ls:
            if (col < len(row)
                    and isinstance(row[col], str)
                    and row[col] == 'unknown'):
                row[col] = replacement
    return ls
# Two consecutive passes over the same table, as described in the question.
for _ in range(2):
    remove_unknowns(my_data)

我想要的输出是这样的:

# Desired result: the same ten rows with no 'unknown' cell left.
my_data = [
    ['58', 'management', 'married', 'no'],
    ['44', 'technician', 'single', 'no'],
    ['33', 'entrepreneur', 'married', 'no'],
    ['47', 'blue-collar', 'married', 'no'],
    ['33', 'management', 'single', 'no'],
    ['35', 'management', 'married', 'no'],
    ['28', 'management', 'single', 'no'],
    ['42', 'entrepreneur', 'divorced', 'no'],
    ['58', 'retired', 'married', 'no'],
    ['43', 'technician', 'married', 'no'],
]

1 个答案:

答案 0 :(得分:1)

如果“最常见”指的是把所有值合并在一起后出现次数最多的元素,那么先找出这个元素,然后遍历各个子列表,用它替换所有的 "unknown":

from collections import Counter
from itertools import chain

# Sample table; the string "unknown" marks the cells to be filled in.
my_data = [
    ['58', 'management', 'unknown', 'no'],
    ['44', 'technician', 'single', 'no'],
    ['33', 'entrepreneur', 'married', 'no'],
    ['47', 'blue-collar', 'married', 'no'],
    ['33', 'unknown', 'single', 'no'],
    ['35', 'management', 'unknown', 'no'],
    ['28', 'management', 'single', 'no'],
    ['42', 'entrepreneur', 'divorced', 'no'],
    ['58', 'retired', 'married', 'no'],
    ['43', 'technician', 'unknown', 'no'],
]

# Tally every value of every row in one pass. The placeholder is left out of
# the count so that it can never be selected as its own replacement.
tally = Counter(
    value for value in chain.from_iterable(my_data) if value != "unknown"
)

# Most frequent real value across the whole table; if there is none at all,
# fall back to the placeholder so the loop below is a no-op.
cn = tally.most_common(1)[0][0] if tally else "unknown"

for sub in my_data:
    # Slice assignment rewrites the existing row object instead of rebinding
    # the loop variable, so my_data itself is updated.
    sub[:] = [cn if s == "unknown" else s for s in sub]

如果您确实想要按列取最常见的值,则会稍微复杂一些:需要应用相同的 Counter 逻辑,但每一列各使用一个 Counter:

from collections import Counter
from pprint import pprint as pp

# One Counter per column, sized from the data instead of a hard-coded 4.
width = max(len(sub) for sub in my_data) if my_data else 0
counters = [Counter() for _ in range(width)]

for sub in my_data:
    for ind, ele in enumerate(sub):
        # The placeholder is not a real value: counting it would let it win
        # (or tie for) "most common" and be substituted for itself.
        if ele != "unknown":
            counters[ind][ele] += 1

# Most common real word of each column. A column with no real value keeps
# the placeholder. On a tie the word seen first wins (Python 3.7+), e.g.
# 'single' rather than 'married' in the third column of the sample data.
l = [c.most_common(1)[0][0] if c else "unknown" for c in counters]

for sub in my_data:
    # If a word is unknown, replace it with the word chosen for its column.
    sub[:] = [l[ind] if ele == "unknown" else ele for ind, ele in enumerate(sub)]

pp(my_data)

输出结果为:

[['58', 'management', 'married', 'no'],
 ['44', 'technician', 'single', 'no'],
 ['33', 'entrepreneur', 'married', 'no'],
 ['47', 'blue-collar', 'married', 'no'],
 ['33', 'management', 'single', 'no'],
 ['35', 'management', 'married', 'no'],
 ['28', 'management', 'single', 'no'],
 ['42', 'entrepreneur', 'divorced', 'no'],
 ['58', 'retired', 'married', 'no'],
 ['43', 'technician', 'married', 'no']]

在第三列中,single 和 married 出现的次数相同,因此两者都可能成为替换值(具体取决于并列时的选择顺序)。