当我在执行期间从文件系统加载一个较小的列表时,多进程(multiprocessing)工作正常,8 个核心同时运行。但是,当我在执行期间从文件系统加载一个非常大的列表时,只有 3 个核心在运行。我认为这是因为各个进程都在等待该列表被复制到每个进程中。
是否有办法强制各个进程同时访问该列表?
tokenization.py
def compute_customizedStopwords(dictionary_path=None):
    """Load the customized stop-word set from the dictionary file.

    Every line of the file is treated as one entry: its newline characters
    are stripped and the remainder is lower-cased before being added to
    the set.

    Args:
        dictionary_path: Optional path of the stop-word file. When omitted,
            ``humanDecisionDictionary.txt`` under ``BASE_DIR`` is read.

    Returns:
        A ``set`` of the lower-cased lines of the file.
    """
    if dictionary_path is None:
        dictionary_path = BASE_DIR + "/app1/NLP/Dictionary/humanDecisionDictionary.txt"
    # The context manager closes the file even if reading raises.
    with open(dictionary_path, 'r') as stopword_dictionary:
        return {line.strip('\n').lower() for line in stopword_dictionary}
def tokenize_task(narrative, customizedStopwords):
    """Tokenize one narrative, drop stop words and store the tokens.

    Args:
        narrative: Object whose ``corpus`` attribute is a comma-separated
            string of tokens.
        customizedStopwords: Container of tokens to discard; membership is
            tested with ``in``.

    Returns:
        The list of tokens that remain after stop-word removal.
    """
    tokens = narrative.corpus.split(",")
    # Remove stop words.
    tokens = [token for token in tokens if token not in customizedStopwords]
    newTokenObjects = [Token(token=token) for token in tokens]
    # Save all tokens with a single bulk call.
    Token.objects.bulk_create(newTokenObjects)
    return tokens
views.py
def tokenize(request):
    """Tokenize every narrative in a process pool and return the tokens.

    Args:
        request: The incoming HTTP request (not inspected here).

    Returns:
        An ``HttpResponse`` built from the flat list of all tokens that
        survived stop-word removal, in narrative order.
    """
    narratives = models.Narrative.objects.all()  # get all documents
    customizedStopwords = compute_customizedStopwords()  # get stopwords set
    pool = Pool()
    try:
        # Pool.apply() blocks until its single task has finished, so calling
        # it in a loop runs the tasks one after another. apply_async()
        # submits every task first; the results are collected afterwards.
        # NOTE(review): the stop-word set is still pickled and sent with
        # every task; for a very large set consider loading it once per
        # worker through Pool(initializer=...).
        pending = [
            pool.apply_async(tokenize_task, args=(narrative, customizedStopwords))
            for narrative in narratives
        ]
        results = [job.get() for job in pending]
    finally:
        # Release the worker processes instead of leaking them per request.
        pool.close()
        pool.join()
    # Flatten the per-narrative token lists into a single list.
    tokens = []
    for narrative_tokens in results:
        tokens.extend(narrative_tokens)
    return HttpResponse(tokens)