下面的代码对2015年的每个季度分别运行一个代码块。每个季度的代码块大约需要3个小时才能运行完(完整代码需要超过12个小时)。我不太了解Multiprocessing在Python中是如何工作的。
如您所见,所有四个tweets_Q1,tweets_Q2,tweets_Q3和tweets_Q4块按顺序运行,但它们彼此独立。我想在不同的进程/线程上运行每个tweets_Q1,tweets_Q2,tweets_Q3和tweets_Q4块。我怎样才能做到这一点?
def finalQuarterlyAnalysis(sorted_all_tweets, industryDictionary, startyear=2015):
    """Compute the popular sectors for each calendar quarter of ``startyear``.

    The tweets are split into four quarters by comparing their date keys as
    strings. Each quarter is vectorized with ``createSparseMatrix`` and the
    result is handed to ``calculatePopularSectors``. The quarters do not
    share any intermediate state.

    Args:
        sorted_all_tweets: iterable of ``(date, tweet)`` pairs. It is
            iterated once per quarter, so it must be re-iterable (e.g. a
            list). The dates are compared against ``'YYYY-MM-DD'`` strings,
            so they are assumed to be ISO-formatted strings -- TODO confirm
            against the caller.
        industryDictionary: training data, passed through unchanged to
            ``calculatePopularSectors``.
        startyear: year whose four quarters are analysed.

    Returns:
        dict mapping ``'1stQuarter <year>'``, ``'2ndQuarter <year>'``,
        ``'3rdQuarter <year>'`` and ``'4thQuarter <year>'`` to whatever
        ``calculatePopularSectors`` returned for that quarter.
    """
    year = str(startyear)
    following_year = str(int(startyear) + 1)

    # (result label, quarter number, first day inclusive, end exclusive).
    # Half-open ranges keep the boundary days (e.g. Mar 31 / Apr 1) inside
    # exactly one quarter, for date-only and date-time keys alike.
    quarters = [
        ('1stQuarter', 1, year + '-01-01', year + '-04-01'),
        ('2ndQuarter', 2, year + '-04-01', year + '-07-01'),
        ('3rdQuarter', 3, year + '-07-01', year + '-10-01'),
        ('4thQuarter', 4, year + '-10-01', following_year + '-01-01'),
    ]

    resultDirectory = {}
    for label, number, first_day, next_quarter in quarters:
        quarter_tweets = [(key, value) for key, value in sorted_all_tweets
                          if first_day <= key < next_quarter]
        X, vocab = createSparseMatrix(quarter_tweets)
        # A single parenthesized argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('Q%d tweets vectorized %d tweets. found %d terms.'
              % (number, X.shape[0], X.shape[1]))
        resultDirectory['%s %s' % (label, year)] = calculatePopularSectors(
            X, vocab, industryDictionary)
    return resultDirectory
# Run the quarterly analysis for 2015 and keep the returned mapping.
resultDirectory = finalQuarterlyAnalysis(
    sorted_all_tweets,
    industryDictionary,
    startyear=2015
)