我已经写好了下面的脚本,它根据停用词的数量来检测语言,然后调用另一个脚本进行词性标注。但是,我不知道如何根据检测到的语言来切换所加载的语料库(目前固定为 brown 语料库):
tagger = UnigramTagger(nltk.corpus.brown.tagged_sents())
有没有可以调用的方法?还是应该在初始设置时根据输入做选择,或者为每种语言分别启动一个进程?
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Imports
"""
import sys
import getopt
import socket
import select
import pprint
import json
import subprocess
try:
from nltk import wordpunct_tokenize
from nltk.tag import UnigramTagger
from nltk.corpus import stopwords
except ImportError:
print '[!] Missing NLTK packages. Attempting to install. This may take a while!'
subprocess.call('python scripts/SetupNltk.py', shell=True)
"""
Globals
"""
"""
Definitions
"""
def _calculate_languages_ratio(text):
    """Score *text* against every stopword corpus NLTK ships.

    Returns a dict mapping each language fileid to the number of distinct
    lower-cased tokens of *text* that appear in that language's stopword
    list — a rough signal of which language the text is written in.
    """
    # Tokenize once and lower-case up front; a set suffices because only
    # distinct overlaps are counted.
    token_set = {token.lower() for token in wordpunct_tokenize(text)}
    return {
        language: len(token_set.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }
def detect_language(text):
    """Return the stopword-corpus language with the largest overlap with *text*."""
    ratios = _calculate_languages_ratio(text)
    best_language, _count = max(ratios.items(), key=lambda item: item[1])
    return best_language
"""
Main entry point
"""
if __name__ == '__main__':
language = detect_language(text)
print language