# Queue the worker processes use to hand their partial results back to the
# parent process.
output_rdpartition = mp.Queue()


def _timestamp_bucket():
    """Return an empty ``{time_stamp: [keywords]}`` mapping for one user.

    This is a named module-level function rather than a lambda because the
    defaultdict that uses it as its factory is sent through a
    multiprocessing queue, which pickles it; a lambda factory cannot be
    pickled, so the put would fail and the parent would never receive it.
    """
    return defaultdict(list)


def read_partition_zipfile(infile, stop_words, startline, endline):
    """Collect per-user keywords from one line range of a gzip file.

    Each line is expected to look like
    ``<time_stamp>|:|<kw1>,<kw2>,...|:|<user_id>``: the first field is an
    integer time stamp, the second a comma-separated keyword list and the
    last the user id. Keywords shorter than two characters are dropped, the
    rest are lower-cased, and those found in ``stop_words`` are removed.

    Args:
        infile: Path of the gzip-compressed input file.
        stop_words: Iterable of lower-case words to discard.
        startline: Zero-based index of the first line to process.
        endline: Zero-based index one past the last line to process.

    Returns:
        None. The result is put on ``output_rdpartition`` as a 2-tuple
        ``(chunk_user_d, chunk_user_withoutstamp_d)`` where
        ``chunk_user_d[user_id][time_stamp]`` and
        ``chunk_user_withoutstamp_d[user_id]`` are lists of keywords.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a line does not contain the ``|:|`` separator.
    """
    # Build the lookup set once so each keyword test is O(1).
    stop_set = frozenset(stop_words)
    chunk_user_d = defaultdict(_timestamp_bucket)
    chunk_user_withoutstamp_d = defaultdict(list)
    with gzip.open(infile, "rb") as f:
        for j, line in enumerate(f):
            if j >= endline:
                # Nothing past the partition is needed; stop decompressing.
                break
            if j < startline:
                continue
            if j % 10000 == 0:
                print("processed %d lines" % j)
            if not isinstance(line, str):
                # Binary mode yields bytes on Python 3 (str on Python 2).
                line = line.decode("utf-8")
            # Strip only the line terminator so a final line without a
            # trailing newline keeps the last character of its user id.
            fields = line.rstrip("\r\n").split("|:|")
            time_stamp = int(fields[0])
            user_id = fields[-1]
            keywords = [
                item.lower() for item in fields[1].split(",") if len(item) >= 2
            ]
            keywords = [item for item in keywords if item not in stop_set]
            chunk_user_d[user_id][time_stamp] += keywords
            chunk_user_withoutstamp_d[user_id] += keywords
    output_rdpartition.put((chunk_user_d, chunk_user_withoutstamp_d))
def main():
    """Read three partitions of the input in parallel and print the results.

    Starts one worker process per partition, each running
    ``read_partition_zipfile`` on the line range
    ``[p_index[j], p_index[j + 1])`` of ``in_file``, then collects one result
    from ``output_rdpartition`` per worker and prints the collected list.
    """
    print("at the start of main")
    # NOTE(review): only partitions 0..2 are launched even though p_index
    # holds more boundaries -- confirm whether that is intentional.
    processes_rd = [
        mp.Process(
            target=read_partition_zipfile,
            args=(in_file, stop_words, p_index[j], p_index[j + 1]),
        )
        for j in range(0, 3)
    ]
    for p in processes_rd:
        p.start()
    # Drain the queue BEFORE joining: a child that has put data on a queue
    # does not terminate until that data has been flushed to the pipe, so
    # joining first can deadlock.
    # Each element is one worker's (chunk_user_d, chunk_user_withoutstamp_d)
    # tuple; elements arrive in completion order, not partition order.
    results_rd = [output_rdpartition.get() for _ in processes_rd]
    for p in processes_rd:
        p.join()
    print(results_rd)
if __name__ == '__main__':
    # Comma-separated list of common English words that are filtered out of
    # the keywords.
    stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
    # A frozenset makes the per-keyword "not in stop_words" test O(1)
    # instead of a linear scan of a list.
    stop_words = frozenset(stop_words.split(","))
    in_file = 'uniq.txt.gz'
    # Line-number boundaries of the partitions, 2,800,000 lines apart.
    # NOTE(review): range() excludes its stop value, so 28000000 itself is
    # not a boundary -- confirm the last partition is not needed.
    p_index = range(0, 28000000, 2800000)
    main()
问题似乎出在队列上：我可以在函数内部打印出结果，但无法在主进程中拿到函数的输出。