我正在尝试删除标点符号并计算 话。我的部分代码有效,但我的代码似乎不完整。
def word_distribution(text_string):
words_list = text_string.split()
words_list = [words_list[i].lower() for i in range(len(words_list))]
for i in range(len(words_list)):
if not words_list[i].isalpha():
word = words_list[i]
for j in word:
if j != "\'" and not j.isalpha():
id = word.find(j)
words_list[i] = word.replace(word[id],"")
words_dict = {}
for word in words_list:
if word in words_dict:
words_dict[word] += 1
else:
words_dict[word] = 1
result = words_dict
return result
word_distribution("Hello,, hello, hi, Hurray!!!, Hurray, What's up!,Today
is Saturday, Saturday, saturday. Funday.")
我正在尝试像{'hello':2,'hi':1}这样的单词的字典。它适用于问候词,但适用于Hurray,它给我的输出类似于hurray !!!':1,'hurray':1而不是Hurray:2。我不知道为什么会这样。
任何对为什么这样的行为表示赞赏的输入。
答案 0 :(得分:0)
您可以使用正则表达式用非字母字符分隔句子。并可以获得结果
import re
def word_distribution(s):
res = re.split(r'\W+', s)
counter = {}
for word in res:
if not counter.get(word):
counter[word] = 1
else:
counter[word] += 1
return counter
s = "Hello,, hello, hi, Hurray!!!, Hurray, What's up!,Today is Saturday, Saturday, saturday. Funday."
print(word_distribution(s))```
#Output:
#{'Hello': 1, 'hello': 1, 'hi': 1, 'Hurray': 2, 'What': 1, 's': 1, 'up': 1, 'Today': 1, 'is': 1, 'Saturday': 2, 'saturday': 1, 'Funday': 1, '': 1}
答案 1 :(得分:0)
import re
from collections import Counter
def word_distribution(text_string):
regex = re.compile('[^a-zA-Z]')
return Counter(regex.sub('', word.lower()) for word in text_string.split())
s = "Hello,, hello, hi, Hurray!!!, Hurray, What's up!,Today is Saturday, Saturday, saturday. Funday."
print(word_distribution(s)) #Counter({'saturday': 3, 'hello': 2, 'hurray': 2, 'whats': 1, 'funday': 1, 'is': 1, 'hi': 1, 'uptoday': 1})