我有1106列和6689行要填充稀疏矩阵,通过使用csv库中的DictReader读取csv文件来填充。 有一些不好的列,所以我想删除它们(目前正在尝试识别哪些是坏列)。
无论如何这里是代码:
def read_annotations(filehandle):
with open(filehandle, 'r', encoding = 'utf-8') as csvfile:
reader = csv.DictReader(csvfile)
workers = defaultdict(list)
ids = []
for row in reader:
# CF annotators given a choice to choose link on their request, We treat it as neither (so do expert raters
if row['which_form_of_hate_speech_is_this'].lower() == 'link':
row['which_form_of_hate_speech_is_this'] = 'neither'
# Create a dictionary containing each annotation by a worker,
# save their trust score, their annotation, the id of the tweet and the tweet in a tuple
workers[row['_worker_id']].append((row['_trust'], row['which_form_of_hate_speech_is_this'], row['id'], row['tweet']))
return workers, set(ids)
def annotation_writer(workers, ids):
""" Writes a annotations to file where each column is a worker and each row is a tweet ID
:param workers: Dictionary containing worker ID as key, trust, annotation, tweet id and tweet as tuple
"""
# Create empty dataframe with tweet IDS as indexes and workers as columns
df = pd.DataFrame(index = ids, columns= list(workers.keys()))
# Go through each worker
bad_annotators = [683, 691, 694, 691]
for i, w_idx in enumerate(workers):
# Go through each list item
if i <= 693:
if i in bad_annotators:
print(w_idx)
continue
for item in workers[w_idx]:
_, label, idx, tweet = item
# Set the label they have chosen for the item at index: TweetID and row worker_id
df.set_value(idx, w_idx, label)
return df
按累计时间排序的分析输出:
tottime percall cumtime percall filename:lineno(function)
466/1 0.020 0.000 361.865 361.865 {built-in method builtins.exec}
1 0.000 0.000 361.865 361.865 annotator.py:1(<module>)
1 93.955 93.955 360.789 360.789 annotator.py:24(annotation_writer)
20923 0.143 0.000 266.759 0.013 frame.py:1840(set_value)
6594 0.088 0.000 265.032 0.040 indexing.py:126(__setitem__)
6594 0.478 0.000 263.543 0.040 indexing.py:224(_setitem_with_indexer)
6594 0.077 0.000 254.250 0.039 frame.py:2743(reindex_axis)
6594 0.071 0.000 254.173 0.039 generic.py:2307(reindex_axis)
6594 0.144 0.000 250.682 0.038 generic.py:2320(_reindex_with_indexers)
6594 0.095 0.000 250.327 0.038 internals.py:3560(reindex_indexer)
6594 0.029 0.000 249.124 0.038 internals.py:3595(<listcomp>)
6594 0.155 0.000 249.096 0.038 internals.py:975(take_nd)
6594 0.239 0.000 248.633 0.038 algorithms.py:840(take_nd)
34098 140.266 0.004 140.266 0.004 {built-in method numpy.core.multiarray.empty}
6594 107.927 0.016 107.927 0.016 {pandas.algos.take_2d_axis1_object_object}
40942 0.136 0.000 4.503 0.000 base.py:1915(get_loc)
54130 4.213 0.000 4.242 0.000 {method 'get_loc' of 'pandas.index.IndexEngine' objects}
6594 0.052 0.000 3.299 0.001 base.py:2295(reindex)
6594 0.079 0.000 3.099 0.000 base.py:2028(get_indexer)
6594 2.871 0.000 2.923 0.000 {method 'get_indexer' of 'pandas.index.IndexEngine' objects}
6594 0.075 0.000 2.321 0.000 base.py:3011(insert)
13189 0.112 0.000 1.707 0.000 internals.py:2578(__init__)
19787/13191 0.368 0.000 1.620 0.000 base.py:124(__new__)
6594 0.027 0.000 1.519 0.000 internals.py:2916(setitem)
6594 0.108 0.000 1.492 0.000 internals.py:2811(apply)
20923 0.084 0.000 1.434 0.000 generic.py:1345(_get_item_cache)
6594 0.034 0.000 1.359 0.000 indexing.py:101(_get_setitem_indexer)
6594 0.048 0.000 1.281 0.000 indexing.py:163(_convert_tuple)
13188 0.105 0.000 1.229 0.000 indexing.py:1102(_convert_to_indexer)
13189 0.456 0.000 1.155 0.000 internals.py:2674(_rebuild_blknos_and_blklocs)
6594 0.043 0.000 1.008 0.000 base.py:522(_coerce_scalar_to_index)
13192 0.869 0.000 0.895 0.000 {pandas.lib.infer_dtype}
6594 0.031 0.000 0.761 0.000 base.py:354(_shallow_copy_with_infer)
6594 0.130 0.000 0.720 0.000 internals.py:628(setitem)
7283 0.082 0.000 0.694 0.000 internals.py:3283(get)
20473 0.167 0.000 0.669 0.000 internals.py:2482(make_block)
13188 0.020 0.000 0.639 0.000 base.py:950(is_integer)
6595 0.010 0.000 0.619 0.000 base.py:1172(inferred_type)
571/2 0.004 0.000 0.610 0.305 <frozen importlib._bootstrap>:966(_find_and_load)
571/2 0.003 0.000 0.610 0.305 <frozen importlib._bootstrap>:939(_find_and_load_unlocked)
452/2 0.003 0.000 0.610 0.305 <frozen importlib._bootstrap>:659(_load_unlocked)
387/2 0.002 0.000 0.610 0.305 <frozen importlib._bootstrap_external>:656(exec_module)
599/2 0.001 0.000 0.609 0.305 <frozen importlib._bootstrap>:214(_call_with_frames_removed)
2 0.000 0.000 0.609 0.304 __init__.py:5(<module>)
989631/989629 0.370 0.000 0.603 0.000 {built-in method builtins.isinstance}
7283 0.047 0.000 0.593 0.000 frame.py:2331(_box_item_values)
7283 0.031 0.000 0.480 0.000 frame.py:2338(_box_col_values)
1 0.068 0.068 0.465 0.465 annotator.py:7(read_annotations)
13877 0.069 0.000 0.463 0.000 internals.py:183(make_block_same_class)
7283 0.038 0.000 0.449 0.000 series.py:236(from_array)
432/41 0.001 0.000 0.440 0.011 {built-in method builtins.__import__}
7283 0.133 0.000 0.438 0.000 internals.py:3312(iget)
31512 0.141 0.000 0.389 0.000 csv.py:106(__next__)
20472 0.176 0.000 0.381 0.000 internals.py:1657(__init__)
7284 0.054 0.000 0.369 0.000 series.py:120(__init__)
70099/69243 0.051 0.000 0.367 0.000 <frozen importlib._bootstrap>:996(_handle_fromlist)
6594 0.350 0.000 0.350 0.000 {built-in method numpy.core.multiarray.concatenate}
19787 0.116 0.000 0.329 0.000 common.py:1380(_asarray_tuplesafe)
6594 0.032 0.000 0.310 0.000 internals.py:171(make_block)
533564/425149 0.221 0.000 0.290 0.000 {built-in method builtins.len}
32973 0.122 0.000 0.282 0.000 internals.py:2619(shape)
232315 0.230 0.000 0.270 0.000 {built-in method builtins.getattr}
41781 0.055 0.000 0.267 0.000 common.py:1710(is_datetimetz)
54821 0.182 0.000 0.260 0.000 generic.py:2674(__setattr__)
40673 0.061 0.000 0.251 0.000 numeric.py:414(asarray)
58149 0.225 0.000 0.238 0.000 {built-in method numpy.core.multiarray.array}
126058 0.096 0.000 0.231 0.000 generic.py:7(_check)
103351 0.124 0.000 0.230 0.000 dtypes.py:74(is_dtype)
61570 0.068 0.000 0.227 0.000 common.py:1736(is_categorical_dtype)
278263/278240 0.222 0.000 0.223 0.000 {built-in method builtins.hasattr}
31532 0.214 0.000 0.222 0.000 {built-in method builtins.next}
13188 0.043 0.000 0.220 0.000 indexing.py:183(_convert_scalar_indexer)
15213 0.030 0.000 0.208 0.000 {method 'any' of 'numpy.ndarray' objects}
52752 0.059 0.000 0.206 0.000 generic.py:333(_get_axis)
6595 0.054 0.000 0.201 0.000 internals.py:2799(_verify_integrity)
27 0.001 0.000 0.195 0.007 __init__.py:1(<module>)
20473 0.097 0.000 0.194 0.000 internals.py:77(__init__)
6595 0.060 0.000 0.194 0.000 frame.py:210(__init__)
19782 0.042 0.000 0.179 0.000 generic.py:2713(_protect_consolidate)
15213 0.015 0.000 0.178 0.000 _methods.py:37(_any)
6594 0.013 0.000 0.171 0.000 internals.py:555(_try_coerce_and_cast_result)
13188 0.046 0.000 0.168 0.000 generic.py:1407(_maybe_update_cacher)
1 0.000 0.000 0.167 0.167 api.py:5(<module>)
13189 0.069 0.000 0.164 0.000 internals.py:3004(_consolidate_check)
15213 0.163 0.000 0.163 0.000 {method 'reduce' of 'numpy.ufunc' objects}
98919 0.034 0.000 0.159 0.000 internals.py:2621(<genexpr>)
7284 0.032 0.000 0.156 0.000 series.py:270(_set_axis)
6594 0.029 0.000 0.156 0.000 internals.py:510(_try_cast_result)
1 0.000 0.000 0.155 0.155 groupby.py:1(<module>)
13188 0.023 0.000 0.150 0.000 generic.py:2723(_consolidate_inplace)
1 0.000 0.000 0.145 0.145 frame.py:10(<module>)
6594 0.047 0.000 0.141 0.000 common.py:527(_maybe_promote)
15402 0.024 0.000 0.138 0.000 common.py:1731(is_categorical)
13188 0.046 0.000 0.136 0.000 base.py:972(_convert_scalar_indexer)
13191 0.073 0.000 0.131 0.000 base.py:309(_simple_new)
59346 0.094 0.000 0.128 0.000 generic.py:320(_get_axis_name)
1 0.000 0.000 0.126 0.126 __init__.py:106(<module>)
2 0.000 0.000 0.125 0.063 __init__.py:9(<module>)
1514/1502 0.050 0.000 0.125 0.000 {built-in method builtins.__build_class__}
2 0.000 0.000 0.123 0.062 __init__.py:15(<module>)
54132 0.091 0.000 0.117 0.000 {pandas.lib.values_from_object}
6595 0.029 0.000 0.115 0.000 base.py:1507(equals)
41781 0.037 0.000 0.108 0.000 common.py:1575(is_datetime64tz_dtype)
6595 0.033 0.000 0.106 0.000 base.py:1180(is_all_dates)
52760 0.057 0.000 0.104 0.000 base.py:440(values)
1 0.000 0.000 0.104 0.104 series.py:3(<module>)
1 0.000 0.000 0.102 0.102 add_newdocs.py:10(<module>)
387 0.004 0.000 0.102 0.000 <frozen importlib._bootstrap_external>:726(get_code)
1 0.000 0.000 0.100 0.100 config_init.py:11(<module>)
107722 0.069 0.000 0.097 0.000 base.py:409(__len__)
13188 0.038 0.000 0.095 0.000 generic.py:2726(f)
6594 0.025 0.000 0.093 0.000 indexing.py:1860(maybe_convert_ix)
1 0.000 0.000 0.093 0.093 plotting.py:3(<module>)
1 0.000 0.000 0.090 0.090 converter.py:1(<module>)
1 0.000 0.000 0.089 0.089 format.py:2(<module>)
13189 0.022 0.000 0.089 0.000 internals.py:3005(<listcomp>)
27482 0.087 0.000 0.087 0.000 {method 'fill' of 'numpy.ndarray' objects}
1 0.000 0.000 0.087 0.087 type_check.py:3(<module>)
6596 0.085 0.000 0.085 0.000 {built-in method pandas.lib.list_to_object_array}
435/433 0.001 0.000 0.084 0.000 <frozen importlib._bootstrap>:570(module_from_spec)
20923 0.082 0.000 0.082 0.000 {method 'set_value' of 'pandas.index.IndexEngine' objects}
1 0.002 0.002 0.075 0.075 frame.py:307(_init_dict)
13190 0.075 0.000 0.075 0.000 {built-in method numpy.core.multiarray.arange}
6594 0.024 0.000 0.075 0.000 internals.py:1665(is_bool)
6594 0.057 0.000 0.074 0.000 algorithms.py:807(_get_take_nd_function)
13188 0.054 0.000 0.073 0.000 base.py:506(_get_attributes_dict)
764 0.002 0.000 0.072 0.000 re.py:278(_compile)
192 0.000 0.000 0.071 0.000 re.py:222(compile)
43/42 0.000 0.000 0.071 0.002 <frozen importlib._bootstrap_external>:900(create_module)
43/42 0.055 0.001 0.070 0.002 {built-in method _imp.create_dynamic}
167 0.001 0.000 0.070 0.000 sre_compile.py:531(compile)
387 0.001 0.000 0.070 0.000 <frozen importlib._bootstrap_external>:471(_compile_bytecode)
1 0.000 0.000 0.068 0.068 __init__.py:101(<module>)
387 0.067 0.000 0.067 0.000 {built-in method marshal.loads}
13188 0.028 0.000 0.067 0.000 common.py:1532(is_dtype_equal)
13189 0.060 0.000 0.066 0.000 internals.py:276(ftype)
540/539 0.006 0.000 0.066 0.000 <frozen importlib._bootstrap>:879(_find_spec)
20472 0.065 0.000 0.065 0.000 generic.py:2658(__getattr__)
1 0.000 0.000 0.064 0.064 frame.py:5224(_arrays_to_mgr)
6594 0.011 0.000 0.064 0.000 generic.py:2753(_is_mixed_type)
13189 0.038 0.000 0.063 0.000 internals.py:2579(<listcomp>)
13879 0.062 0.000 0.062 0.000 generic.py:94(__init__)
13191 0.019 0.000 0.057 0.000 common.py:1696(is_bool_dtype)
7283 0.009 0.000 0.057 0.000 common.py:73(isnull)
20923 0.017 0.000 0.057 0.000 series.py:366(_values)
2 0.000 0.000 0.056 0.028 common.py:1(<module>)
535 0.001 0.000 0.055 0.000 <frozen importlib._bootstrap_external>:1130(find_spec)
7284 0.036 0.000 0.055 0.000 internals.py:3778(__init__)
535 0.004 0.000 0.055 0.000 <frozen importlib._bootstrap_external>:1098(_get_spec)
53448 0.044 0.000 0.054 0.000 base.py:3381(_ensure_index)
20473 0.046 0.000 0.054 0.000 internals.py:191(mgr_locs)
59355 0.053 0.000 0.053 0.000 {method 'view' of 'numpy.ndarray' objects}
6594 0.036 0.000 0.052 0.000 common.py:733(_possibly_downcast_to_dtype)
990 0.011 0.000 0.052 0.000 <frozen importlib._bootstrap_external>:1212(find_spec)
6594 0.013 0.000 0.051 0.000 generic.py:1476(_check_is_chained_assignment_possible)
2 0.001 0.000 0.051 0.026 pyparsing.py:58(<module>)
1 0.000 0.000 0.051 0.051 rcsetup.py:15(<module>)
7283 0.022 0.000 0.049 0.000 generic.py:1359(_set_as_cached)
1 0.000 0.000 0.048 0.048 fontconfig_pattern.py:7(<module>)
26380 0.036 0.000 0.048 0.000 common.py:1511(_get_dtype_type)
7283 0.033 0.000 0.048 0.000 common.py:94(_isnull_new)
6594 0.029 0.000 0.048 0.000 base.py:2201(_possibly_promote)
7284 0.020 0.000 0.045 0.000 series.py:302(name)
1 0.002 0.002 0.045 0.045 frame.py:5521(_homogenize)
95791 0.044 0.000 0.044 0.000 {method 'get' of 'dict' objects}
19783 0.017 0.000 0.043 0.000 base.py:870(_values)
13188 0.014 0.000 0.043 0.000 indexing.py:1890(is_list_like_indexer)
1105 0.003 0.000 0.043 0.000 series.py:2787(_sanitize_array)
6594 0.042 0.000 0.042 0.000 {pandas.lib.is_bool_array}
169/167 0.001 0.000 0.042 0.000 sre_parse.py:819(parse)
26377 0.027 0.000 0.041 0.000 internals.py:3276(_consolidate_inplace)
6595 0.041 0.000 0.041 0.000 {pandas.lib.is_datetime_array}
19782 0.027 0.000 0.041 0.000 indexing.py:128(<genexpr>)
20923 0.026 0.000 0.040 0.000 internals.py:3911(internal_values)
194369/194289 0.038 0.000 0.040 0.000 {built-in method builtins.issubclass}
557/169 0.003 0.000 0.040 0.000 sre_parse.py:429(_parse_sub)
751/186 0.014 0.000 0.039 0.000 sre_parse.py:491(_parse)
26376 0.031 0.000 0.039 0.000 common.py:1491(_get_dtype)
6594 0.013 0.000 0.038 0.000 generic.py:1402(_is_view)
6 0.000 0.000 0.038 0.006 api.py:3(<module>)
1 0.000 0.000 0.038 0.038 __init__.py:27(<module>)
2 0.000 0.000 0.037 0.019 __init__.py:7(<module>)
1105 0.002 0.000 0.037 0.000 series.py:2804(_try_cast)
6595 0.019 0.000 0.037 0.000 common.py:272(array_equivalent)
6594 0.010 0.000 0.036 0.000 generic.py:2755(<lambda>)
13207 0.025 0.000 0.036 0.000 __init__.py:168(iteritems)
2 0.000 0.000 0.036 0.018 __init__.py:26(<module>)
6596 0.024 0.000 0.036 0.000 base.py:1143(_engine)
6594 0.014 0.000 0.035 0.000 generic.py:337(_get_block_manager_axis)
7284 0.022 0.000 0.033 0.000 series.py:306(name)
34350 0.032 0.000 0.032 0.000 {pandas.lib.isscalar}
6877 0.006 0.000 0.032 0.000 {built-in method builtins.all}
14294 0.022 0.000 0.031 0.000 common.py:1763(is_list_like)
13188 0.021 0.000 0.030 0.000 indexing.py:547(<genexpr>)
53445 0.030 0.000 0.030 0.000 internals.py:160(mgr_locs)
6596 0.010 0.000 0.030 0.000 base.py:1146(<lambda>)
2 0.000 0.000 0.029 0.014 __init__.py:2912(_call_aside)
1 0.000 0.000 0.029 0.029 __init__.py:2927(_initialize_master_working_set)
7283 0.026 0.000 0.028 0.000 base.py:1247(__getitem__)
1 0.000 0.000 0.028 0.028 generic.py:2(<module>)
6608 0.028 0.000 0.028 0.000 {built-in method builtins.sorted}
13188 0.019 0.000 0.028 0.000 generic.py:1441(_clear_item_cache)
1 0.000 0.000 0.027 0.027 requirements.py:4(<module>)
13190 0.009 0.000 0.026 0.000 base.py:445(get_values)
167 0.000 0.000 0.026 0.000 sre_compile.py:516(_code)
6594 0.012 0.000 0.026 0.000 internals.py:3009(is_mixed_type)
21161 0.026 0.000 0.026 0.000 internals.py:2695(_get_items)
6595 0.011 0.000 0.025 0.000 {built-in method builtins.sum}
6594 0.015 0.000 0.025 0.000 internals.py:3027(is_view)
13188 0.015 0.000 0.025 0.000 internals.py:3260(consolidate)
2 0.000 0.000 0.024 0.012 __init__.py:45(<module>)
39565 0.024 0.000 0.024 0.000 internals.py:2996(is_consolidated)
523 0.001 0.000 0.024 0.000 decorators.py:181(__call__)
7283 0.024 0.000 0.024 0.000 internals.py:301(iget)
545 0.004 0.000 0.023 0.000 textwrap.py:415(dedent)
6594 0.021 0.000 0.023 0.000 common.py:446(_infer_dtype_from_scalar)
2 0.000 0.000 0.023 0.011 index.py:2(<module>)
6594 0.020 0.000 0.022 0.000 generic.py:124(_init_mgr)
63023 0.022 0.000 0.022 0.000 csv.py:92(fieldnames)
6594 0.017 0.000 0.021 0.000 generic.py:307(_get_axis_number)
23 0.000 0.000 0.021 0.001 pyparsing.py:798(_trim_arity)
22 0.000 0.000 0.021 0.001 pyparsing.py:806(extract_stack)
22 0.000 0.000 0.021 0.001 traceback.py:192(extract_stack)
22 0.004 0.000 0.021 0.001 traceback.py:303(extract)
30 0.001 0.000 0.021 0.001 __init__.py:349(namedtuple)
1 0.000 0.000 0.021 0.021 internals.py:1(<module>)
6594 0.012 0.000 0.020 0.000 common.py:1550(is_integer_dtype)
1 0.000 0.000 0.020 0.020 parser.py:5(<module>)
13188 0.016 0.000 0.020 0.000 base.py:508(<listcomp>)
1320/167 0.006 0.000 0.019 0.000 sre_compile.py:64(_compile)
19 0.000 0.000 0.019 0.001 pyparsing.py:958(setParseAction)
1 0.000 0.000 0.019 0.019 feedparser.py:20(<module>)
1 0.000 0.000 0.019 0.019 internals.py:3996(create_block_manager_from_arrays)
19784 0.015 0.000 0.019 0.000 internals.py:2623(ndim)
1 0.000 0.000 0.019 0.019 dates.py:111(<module>)
1 0.003 0.003 0.019 0.019 internals.py:4007(form_blocks)
13188 0.019 0.000 0.019 0.000 internals.py:650(<lambda>)
13198 0.019 0.000 0.019 0.000 dtypes.py:122(construct_from_string)
7284 0.018 0.000 0.018 0.000 series.py:292(_set_subtyp)
13188 0.012 0.000 0.018 0.000 missing.py:559(clean_reindex_fill_method)
1105 0.005 0.000 0.018 0.000 common.py:1011(_possibly_cast_to_datetime)
88922 0.018 0.000 0.018 0.000 {method 'append' of 'list' objects}
3288 0.018 0.000 0.018 0.000 {built-in method posix.stat}
6594 0.005 0.000 0.017 0.000 base.py:3425(_ensure_has_len)
6594 0.012 0.000 0.017 0.000 internals.py:4301(_extend_blocks)
17/16 0.000 0.000 0.017 0.001 <frozen importlib._bootstrap>:630(_load_backward_compatible)
5 0.000 0.000 0.017 0.003 __init__.py:34(load_module)
1 0.000 0.000 0.017 0.017 message.py:5(<module>)
2634 0.002 0.000 0.016 0.000 <frozen importlib._bootstrap_external>:68(_path_stat)
14 0.000 0.000 0.016 0.001 __init__.py:663(add_entry)