我其实已经有一段能解决这个问题的代码,但它非常繁琐,做法可能也很笨(我是 Python 初学者),所以希望有人能提供更好的解决方案。
我有一个包含几千个文本段落的数据库。对于每个段落,我都计算了平均单词长度和一个分数。我想把平均单词长度划分成若干区间(bin,本例中步长为 0.25),求出每个区间中分数高于某个阈值的段落所占的比例,然后把结果绘制成图。
下面是我目前的代码。对这样一个简单的任务来说,它显然冗长得离谱,而且如果我想改变区间的宽度,就必须重写很多行。有什么更好的方法吗?
# Bin passages by average word length (0.25-wide bins, upper edge inclusive)
# and plot, for each bin, the fraction of passages whose Score reaches
# ScoreTarget. Each WLxx list collects one 0/1 flag per passage in that bin.
WL40 = []
WL42 = []
WL45 = []
WL47 = []
WL50 = []
WL52 = []
WL55 = []
WL57 = []
WL60 = []
WL62 = []
WL65 = []
WL67 = []
WL70 = []
WL72 = []
ScoreTarget = 100  # This is the threshold
for line in text_table:
    # Only passages longer than 20 words whose average word length lies
    # strictly between 3.5 and 7.25 are counted.
    if line.TextWordCount > 20 and line.AverageWordLength > 3.5 and line.AverageWordLength < 7.25:
        if line.AverageWordLength <= 4:
            WL40.append(0) if line.Score < ScoreTarget else WL40.append(1)
        if line.AverageWordLength > 4 and line.AverageWordLength <= 4.25:
            WL42.append(0) if line.Score < ScoreTarget else WL42.append(1)
        if line.AverageWordLength > 4.25 and line.AverageWordLength <= 4.5:
            WL45.append(0) if line.Score < ScoreTarget else WL45.append(1)
        if line.AverageWordLength > 4.5 and line.AverageWordLength <= 4.75:
            WL47.append(0) if line.Score < ScoreTarget else WL47.append(1)
        if line.AverageWordLength > 4.75 and line.AverageWordLength <= 5:
            WL50.append(0) if line.Score < ScoreTarget else WL50.append(1)
        if line.AverageWordLength > 5 and line.AverageWordLength <= 5.25:
            WL52.append(0) if line.Score < ScoreTarget else WL52.append(1)
        if line.AverageWordLength > 5.25 and line.AverageWordLength <= 5.5:
            WL55.append(0) if line.Score < ScoreTarget else WL55.append(1)
        if line.AverageWordLength > 5.5 and line.AverageWordLength <= 5.75:
            WL57.append(0) if line.Score < ScoreTarget else WL57.append(1)
        if line.AverageWordLength > 5.75 and line.AverageWordLength <= 6:
            WL60.append(0) if line.Score < ScoreTarget else WL60.append(1)
        if line.AverageWordLength > 6 and line.AverageWordLength <= 6.25:
            WL62.append(0) if line.Score < ScoreTarget else WL62.append(1)
        if line.AverageWordLength > 6.25 and line.AverageWordLength <= 6.5:
            WL65.append(0) if line.Score < ScoreTarget else WL65.append(1)
        if line.AverageWordLength > 6.5 and line.AverageWordLength <= 6.75:
            WL67.append(0) if line.Score < ScoreTarget else WL67.append(1)
        if line.AverageWordLength > 6.75 and line.AverageWordLength <= 7:
            WL70.append(0) if line.Score < ScoreTarget else WL70.append(1)
        if line.AverageWordLength > 7 and line.AverageWordLength <= 7.25:
            WL72.append(0) if line.Score < ScoreTarget else WL72.append(1)
# The mean of the 0/1 flags is the fraction of passages at or above the
# threshold in each bin.
Ychance = []
Ychance.append(np.mean(WL40))
Ychance.append(np.mean(WL42))
Ychance.append(np.mean(WL45))
Ychance.append(np.mean(WL47))
Ychance.append(np.mean(WL50))
Ychance.append(np.mean(WL52))
Ychance.append(np.mean(WL55))
Ychance.append(np.mean(WL57))
Ychance.append(np.mean(WL60))
Ychance.append(np.mean(WL62))
Ychance.append(np.mean(WL65))
Ychance.append(np.mean(WL67))
Ychance.append(np.mean(WL70))
Ychance.append(np.mean(WL72))
# X positions are the upper edges of the bins, in the same order as Ychance.
Xchance = [4,
           4.25,
           4.5,
           4.75,
           5,
           5.25,
           5.5,
           5.75,
           6,
           6.25,
           6.5,
           6.75,
           7,
           7.25]
plt.scatter(Xchance, Ychance)
# show must be called; a bare `plt.show` only references the function.
plt.show()
答案 0 :(得分:0)
from bisect import bisect_left

# Plot, per average-word-length bin, the fraction of passages whose Score
# reaches ScoreTarget. Changing the binning only requires editing the three
# Bin* values below.
ScoreTarget = 100   # score threshold
LowerBound = 3.5    # passages at or below this average word length are ignored
BinStart = 4        # upper edge of the first bin
BinStep = 0.25      # bin width
BinCount = 14       # number of bins (last upper edge is 7.25)

# Upper (inclusive) edge of every bin, in ascending order.
range_list = [BinStart + i * BinStep for i in range(BinCount)]

# One counter pair per bin, indexed by position so that empty bins keep
# their slot and the Y values always line up with the X values.
hit_counts = [0] * len(range_list)
total_counts = [0] * len(range_list)

for line in text_table:
    word_len = line.AverageWordLength
    # The last edge itself is excluded by the strict upper comparison.
    if line.TextWordCount > 20 and LowerBound < word_len < range_list[-1]:
        # bisect_left returns the first edge that is >= word_len, i.e. the
        # bin whose (previous_edge, edge] interval contains the value.
        idx = bisect_left(range_list, word_len)
        total_counts[idx] += 1
        hit_counts[idx] += 0 if line.Score < ScoreTarget else 1

Xchance = range_list
# Empty bins are reported as NaN, which matplotlib simply does not draw.
Ychance = [
    hits / total if total else float("nan")
    for hits, total in zip(hit_counts, total_counts)
]
plt.scatter(Xchance, Ychance)
# show must be called; a bare `plt.show` only references the function.
plt.show()
请试用这段代码,并告诉我它是否有效。