我有一个文件,我想找到最常用的10个单词。我省略了停用词和标点符号,然后将结果放入列表中。每行包含一个波斯语句子,一个标签,然后是一个英文单词。问题是,下面的代码返回每行的一个单词。例如,如果行数是12,则返回12个单词。我认为缩进有问题。我该如何解决?
.
.
.
def train ():
RemStopWords (file1, file2) # the function for removing stop words and punctuation at the start of the code
for line in witoutStops:
line = line.strip().split("\t")
words = line[0].split()
uniques = []
q = []
for word in words:
if word not in uniques:
uniques.append(word)
counts = []
for unique in uniques:
count = 0
for word in words:
if word == unique:
count += 1
counts.append((count, unique))
counts.sort()
counts.reverse()
for i in range(min(10, len(counts))):
count, word = counts[i]
print('%s %d' % (word, count))
#q.append(word)
#print (q)
答案 0 :(得分:2)
您可以使用collections.Counter
:
|0%
答案 1 :(得分:0)
编辑:路易斯安那州哈塞克的答案是一种更简单,更优雅的方式,并且具有相同的输出,所以你应该明确地检查一下!
有一种更简单的方法:)
import operator # we will use this later for sorting dictionaries
def train():
# assuming this returns the string of the text
textWithoutStops = RemStopWords(file1, file2)
# dictionary were words are keys and number of time they appear are values
wordCount = {}
for word in textWithoutStops.split(' '): # convert string to list, using spaces as separators
if not word in wordCount:
wordCount[word] = 1
else:
wordCount[word] += 1
# we sort from less to more frequency
sortedWordCount = sorted(wordCount.items(), key=operator.itemgetter(1))
# and reverse the list so it's from more to less frequent
sortedWordCount = sortedWordCount[::-1]
# we take only the first 10, if it has more than 10
if len(sortedWordCount) > 10:
sortedWordCount = sortedWordCount[:10]
# Here we go, a list containing tuples with the structure: (word, count)
return sortedWordCount
例如,如果文件包含您的问题
我有一个文件,我想找到最常用的10个单词。一世 省略了停用词和标点符号,然后将结果放入列表中。 每行包含一个波斯语句子,一个标签,然后是一个英文单词。 问题是,下面的代码返回每行的一个单词。对于 例如,如果行数为12,则返回12个单词。我觉得 缩进有问题。我该如何解决?
输出将是:
[('the', 5), ('I', 4), ('a', 4), ('and', 4), ('in', 2), ('of', 2), ('then', 2), ('returns', 2), ('words', 2), ('fix', 1)]
注意:要打开文本文件并将其所有内容转换为字符串,您可以(并且可能已经执行)以下操作:
with open(file, 'r') as f:
text = f.read()
希望这会对你有帮助!
答案 2 :(得分:0)
基于C ++的解决方案,使用优先级队列,映射和特里 这是使用优先级队列,映射和特里的类似c ++代码。为简单起见,可以从向量字符串中读取内容,但可以轻松修改以从文件中读取单词。
使用C ++查找文件或流中的前K个常见单词 这是priority_queue的可行解决方案,供您参考。
#include <iostream>
#include <vector>
#include <queue>
#include <unordered_map>
using namespace std;
#define K_TH 3
class TrieNode;
typedef struct HeapNode
{
string word;
int frequency;
HeapNode(): frequency(0), word(""){} ;
TrieNode *trieNode;
}HeapNode;
class TrieNode
{
private:
int frequency = 0;
bool m_isLeaf = false;
string word = "";
unordered_map<char, TrieNode*> children;
HeapNode *heapNode = NULL;
public:
TrieNode() {}
TrieNode(char c)
{
children[c] = new TrieNode();
this->m_isLeaf = false;
}
void setWord(string word)
{
this->word = word;
}
string getWord()
{
return this->word;
}
bool isLeaf(void)
{
return this->m_isLeaf;
}
void setLeaf(bool leaf)
{
this->m_isLeaf = leaf;
}
TrieNode* getChild(char c)
{
if (children[c] != NULL)
return children[c];
return NULL;
}
void insert(char c)
{
children[c] = new TrieNode();
}
int getFrequency()
{
return this->frequency;
}
void setFrequency(int frequency)
{
this->frequency = frequency;
}
void setHeapNode(HeapNode *heapNode)
{
this->heapNode = heapNode;
}
HeapNode* getHeapNode()
{
return heapNode;
}
bool operator()(HeapNode* &a, HeapNode* &b)
{
return (a->frequency > b->frequency);
}
};
class Trie
{
private:
TrieNode *root = NULL;
public:
Trie()
{
if (!root)
{
this->root = new TrieNode();
}
}
TrieNode* insert(string word)
{
if (!root)
root = new TrieNode();
TrieNode* current = root;
int length = word.length();
//insert "abc"
for(int i = 0; i < length; ++i)
{
if (current->getChild(word.at(i)) == NULL)
{
current->insert(word.at(i));
}
current = current->getChild(word.at(i));
}
current->setLeaf(true);
current->setWord(word);
current->setFrequency(current->getFrequency() + 1);
return current;
}
};
struct cmp
{
bool operator()(HeapNode* &a, HeapNode* &b)
{
return (a->frequency > b->frequency);
}
};
typedef priority_queue<HeapNode*, vector<HeapNode*>, cmp > MinHeap;
void insertUtils(Trie *root, MinHeap &pq, string word )
{
if (!root)
return;
TrieNode* current = root->insert(word);
HeapNode *heapNode = current->getHeapNode();
if(heapNode)// if word already present in heap
{
heapNode->frequency += 1;
}else if (pq.empty() || pq.size() < K_TH)
{// if word not present in heap and heap is not full;
heapNode = new HeapNode();
heapNode->word = word;
heapNode->frequency = 1;
heapNode->trieNode = current;
current->setHeapNode(heapNode);
pq.push(heapNode);
}else if (pq.top()->frequency < current->getFrequency())
{ // if word is not present and heap is full;
HeapNode *temp = pq.top();
//remove first element and add current word
pq.pop();
delete temp;
heapNode = new HeapNode();
current->setHeapNode(heapNode);
pq.push(heapNode);
}
}
void printKMostFrequentWords(vector<std::string> input)
{
Trie *root = new Trie();
MinHeap minHeap;
for (vector<string>::iterator it = input.begin(); it != input.end(); ++it)
{
insertUtils(root, minHeap, *it);
}
while(!minHeap.empty())
{
HeapNode *heapNode = minHeap.top();
cout << heapNode->word << ":" << heapNode->frequency << endl;
minHeap.pop();
}
}
int main() {
vector<std::string>input( {
"abc", "def", "ghi",
"jkl", "abc", "def",
"mno", "xyz", "abc"
} ) ;
printKMostFrequentWords(input);
}