Question

请看下面这个列表：

 # Sample table from the question: a list of rows, each row a list of four
 # strings. Some cells hold the placeholder string 'unknown'.
 # NOTE(review): the columns look like age / job / marital status / a yes-no
 # flag, but the source never names them - confirm before relying on it.
 my_data= [
     ['58', 'management', 'unknown', 'no'],
     ['44', 'technician', 'single', 'no'],
     ['33', 'entrepreneur', 'married', 'no'],
     ['47', 'blue-collar', 'married', 'no'],
     ['33', 'unknown', 'single', 'no'],
     ['35', 'management', 'unknown', 'no'],
     ['28', 'management', 'single', 'no'],
     ['42', 'entrepreneur', 'divorced', 'no'],
     ['58', 'retired', 'married', 'no'],
     ['43', 'technician', 'unknown', 'no']
]

我想用列表中最常见的元素替换这些 'unknown' 值。我的代码有问题，有人能帮我纠正吗？目前看来，我需要调用 remove_unknowns() 两次才能使它工作。

def most_common(lst):
    """Return the element that occurs most often in *lst*.

    Ties are resolved in favour of the element that appears first in *lst*,
    so the result is the same on every run. (Taking ``max`` over ``set(lst)``
    would make the winner of a tie depend on set iteration order.)

    Args:
        lst: A non-empty iterable of hashable elements.

    Returns:
        The most frequent element.

    Raises:
        ValueError: If *lst* is empty.
    """
    from collections import Counter

    # A single counting pass instead of calling lst.count() once per
    # distinct element.
    counts = Counter(lst)
    if not counts:
        raise ValueError("most_common() arg is an empty sequence")
    # Counter.most_common orders equal counts by first occurrence.
    return counts.most_common(1)[0][0]

def remove_unknowns(ls):
    """Replace each 'unknown' cell with the most common known value of its column.

    The table is modified in place and also returned. A single call is
    sufficient.

    For every column the string values other than ``'unknown'`` are counted,
    and the most frequent one replaces every ``'unknown'`` in that column.
    The placeholder is excluded from the count so it can never be chosen as
    its own replacement, and the count is started afresh for each column so
    values from other columns cannot influence the result.

    When several values are tied for most frequent, the lexicographically
    smallest one is used, which keeps the result deterministic. A column
    that contains no known string value is left unchanged.

    Args:
        ls: A list of rows, each row a mutable list of cells. Rows may have
            different lengths; only string cells are counted or replaced.

    Returns:
        The same list object ``ls``, after in-place replacement.
    """
    from collections import Counter

    if not ls:
        return ls

    # Derive the width from the data instead of assuming four columns.
    width = max(len(row) for row in ls)
    for col in range(width):
        known = Counter(
            row[col]
            for row in ls
            if col < len(row)
            and isinstance(row[col], str)
            and row[col] != 'unknown'
        )
        if not known:
            # Nothing to substitute with: leave this column as it is.
            continue

        top = max(known.values())
        replacement = min(value for value, n in known.items() if n == top)

        for row in ls:
            if (col < len(row)
                    and isinstance(row[col], str)
                    and row[col] == 'unknown'):
                row[col] = replacement
    return ls
# The asker reports that one call is not enough to get rid of every
# 'unknown' cell, hence the second, identical call.
remove_unknowns(my_data)
remove_unknowns(my_data)

我想要的输出是这样的：

my_data= [
 ['58', 'management', 'married', 'no'],
 ['44', 'technician', 'single', 'no'],
 ['33', 'entrepreneur', 'married', 'no'],
 ['47', 'blue-collar', 'married', 'no'],
 ['33', 'management', 'single', 'no'],
 ['35', 'management', 'married', 'no'],
 ['28', 'management', 'single', 'no'],
 ['42', 'entrepreneur', 'divorced', 'no'],
 ['58', 'retired', 'married', 'no'],
 ['43', 'technician', 'married', 'no']
]

Answer 1

如果“最常见”指的是把所有值合在一起统计时出现次数最多的元素，那么先找出这个元素，然后遍历各个子列表，用它替换所有的 "unknown"：

from collections import Counter
from itertools import chain

my_data = [
    ['58', 'management', 'unknown', 'no'],
    ['44', 'technician', 'single', 'no'],
    ['33', 'entrepreneur', 'married', 'no'],
    ['47', 'blue-collar', 'married', 'no'],
    ['33', 'unknown', 'single', 'no'],
    ['35', 'management', 'unknown', 'no'],
    ['28', 'management', 'single', 'no'],
    ['42', 'entrepreneur', 'divorced', 'no'],
    ['58', 'retired', 'married', 'no'],
    ['43', 'technician', 'unknown', 'no'],
]

# Tally every cell of the table in one flat pass, regardless of column,
# and keep the single most frequent value.
overall_counts = Counter(chain.from_iterable(my_data))
cn = overall_counts.most_common(1)[0][0]

# Overwrite the placeholder cells in place so existing references to the
# rows keep seeing the updated data.
for sub in my_data:
    for idx, cell in enumerate(sub):
        if cell == "unknown":
            sub[idx] = cn

如果您实际想要的是每一列中最常见的值，就要稍微复杂一些：需要应用同样的 Counter 逻辑，但每列使用一个 Counter：

from collections import Counter

# One Counter per column, tallying only the known values. The placeholder
# "unknown" is deliberately not counted: if it were, it could tie for (or
# win) first place and end up being "replaced" by itself.
column_counts = [
    Counter() for _ in range(max((len(sub) for sub in my_data), default=0))
]
for sub in my_data:
    for ind, ele in enumerate(sub):
        if ele != "unknown":
            column_counts[ind][ele] += 1

# Replacement value for each column: the most frequent known value, with
# ties broken by taking the lexicographically smallest candidate so the
# result is deterministic. A column without any known value keeps its
# placeholder.
l = []
for counts in column_counts:
    if counts:
        top = max(counts.values())
        l.append(min(ele for ele, n in counts.items() if n == top))
    else:
        l.append("unknown")

for sub in my_data:
    # if word is unknown replace it using the appropriate column word
    sub[:] = [l[ind] if ele == "unknown" else ele for ind, ele in enumerate(sub)]

from pprint import pprint as pp
pp(my_data)

这会给您如下输出：

[['58', 'management', 'married', 'no'],
 ['44', 'technician', 'single', 'no'],
 ['33', 'entrepreneur', 'married', 'no'],
 ['47', 'blue-collar', 'married', 'no'],
 ['33', 'management', 'single', 'no'],
 ['35', 'management', 'married', 'no'],
 ['28', 'management', 'single', 'no'],
 ['42', 'entrepreneur', 'divorced', 'no'],
 ['58', 'retired', 'married', 'no'],
 ['43', 'technician', 'married', 'no']]

在第三列中，single 和 married 都可能成为替换值，因为它们出现的次数相同。

替换2D列表中的元素

1 个答案: