伙计们,我正在编写这个程序,通过推文列表并返回最常用的单词。
我想让它更快但我想知道你是否可以帮助指出一些我可以提高速度的问题或领域。 谢谢
#import string
import re
from string import punctuation
from operator import itemgetter
import pprint
class Tweet:
def __init__(self, timestamp, userId, message):
self.timestamp = timestamp
self.userId = userId
self.message = message
def getDate(self):
tokens = re.split(' ', self.timestamp)
return tokens[0]
def __repr__(self):
return "[timestamp=%s userId=%s message=%s]" % (self.timestamp, self.userId, self.message)
outfile = file
def readOneTweet(file):
""" Reads a single tweet from the file, and returns the string containing the tweet.
This will often just be a single line from the file, but may be more if it ends with a slash.
"""
lineBuffer = ""
while True:
# TODO: read the line and strip it
rawLine = file.readline().strip('\n')
if (len(rawLine)== 0):
break
lineBuffer +=rawLine
if (rawLine[(len(rawLine)-1)]!= "\\"):
break
return lineBuffer
def readTweets():
tweets = []
inputfile = raw_input("Enter filename: ")
# move the try / except around a single tweet.
# so that we can keep going if we encounter a line with an error.
try:
f = open(inputfile , "r")
while True:
tweet = readOneTweet(f) # readOneTweet is method
if not tweet:
break
try:
lineStrip = tweet.rstrip()
split_word = re.split('\t', lineStrip.lower()) #('/([^a-z])([A-Z]) ([0-9])/n:.;\]+/', line.lower())
tweetTime = split_word[1]
userId = split_word[0]
message = split_word[2]
tweets.append(Tweet(tweetTime, userId, message))
if len(tweets) % 10000 == 0:
print 'read', len(tweets), 'tweets'
except IndexError, e:
print "bad tweet", tweet
except IOError:
print "file not found!"
return tweets
######################DATA ##############
"""
- Need to separate tweets
- Obtain information about each tweet - UserID, Time, words
"""
def writeWordFile(word):
toWrite = 'test.txt'
fileHandle = open ( toWrite, 'w' )
for i in word:
fileHandle.write (i)
def dailyMessages(twt):
dailyMsg =dict ()
for i in twt:
date =i.getDate()
#print i.message
#dailyMsg[date] =messageList
if dailyMsg.has_key(date):
dailyMsg[date].append(twt)
else:
dailyMsg[date] =[twt]
#for k, v in dailyMsg.items():
#print k, v, '\n'
return dailyMsg
"""
Takes dailyTweets and perform word coun.
"""
def dailyWord(tweetsByDay):
dailyTweetsWordCount = { }
for date in tweetsByDay.keys():
dayTweets =tweetsByDay[date]
if len(dayTweets) != 0:
count = wordCount(dayTweets)
dailyTweetsWordCount[date] = count
return dailyTweetsWordCount
def wordCount(tweets):
"""Takes a list of tweets and returns a dictionary of counts for words"""
N = 100
# we'll build a dictionary mapping each word to a SET of users who have used it
wordTweeters = {}
for tweet in tweets:
# print tweet
for i in tweet:
for word in i.message.split():
if not wordTweeters.has_key(word):
wordTweeters[word] = set()
wordTweeters[word].add(i.userId)
# we'll build up a dictionary mapping each word to the number of users who have used it.
p = dict ()
#print wordTweeters
for day in wordTweeters.keys():
usersNo = len (wordTweeters[day])
p[day] = usersNo
#print wordTweeters
return p #wordTweeters, p
def searchForMemes(tweetUserCounts):
for key in tweetsUserCounts.keys():
# for pmeme in tweetUserCounts
pass
"""Takes information returned by daily word"""
def isMeme(word, day1Count, day2Count, day3Count):
#takes the daily count
# check if it is a meme
#First - check count
#check count in different days
# determine the if it qualifies as a tweet
# if not drop it do not do below checks
#Second - time stamp
#CHECK ITS TIME TRACK
#How is the count of over time
# rise and fall
#
#Third - user id
# check if is form different users
#how many of those counts are from different users
pass
def dayUserCount(z,word, d1, d2, d3):
""" assume dictionary will be input"""
# check if the word exist in the dictionary
if z.has_key(d1):
date1 =z[d1]
#print value.keys()
if date1.has_key(word):
print date1
c1 =date1[word]
else:
print "word not used in %s"%d1
c1 =0
else:
print 'date does not exist'
if z.has_key(d2):
#print value.keys()
date2 =z[d2]
if date2.has_key(word):
print date2
c2 =date2[word]
else:
print "word not used in %s"%d2
c2 =0
else:
print 'date does not exist'
if z.has_key(d3):
date3 = z[d3]
if date3.has_key(word):
print date3
c3 =date3[word]
else:
print "word not used in %s" %d3
c3 =0
else:
print 'date does not exist'
result = "Word: %s , %s count: %s, %s count: %s, %s count: %s"%(word,d1,c1,d2,c2, d3,c3)
return result
# supportive functions
def hashtag(tw):
hasht =[]
for word in tw.split():
if word.startswith('#'):
hasht.append(word)
return hasht
def httpTag(tw):
http =[]
for word in tw.split():
if word.startswith('http'):
http.append(word)
return http
def reply(tw):
reply =[]
for word in tw.split():
if word.startswith('@'):
reply.append(word)
return reply
def reTweet(tw):
rt =[]
for word in tw.split():
if word.startswith('rt') or word.startswith('RT'):
rt.append(word)
return rt
"""
Old functions
"""
def writeToFile(tweet):
#filename = test.txt
filek = open('test.txt', 'w')
print "writing on the file: "
filek.write(tweet)
# print " _____--------______" + tweet
filek.close()
# count word frequency.
def f2count():
N = 100000000000
words = {}
words_gen = (word.strip(punctuation).lower()
for line in open('c.txt')
for word in line.split())
for word in words_gen:
words[word] = words.get(word, 0) + 1
top_words = sorted(words.iteritems(),
key=lambda(word, count): (-count, word))[:N]
for word, frequency in top_words:
print "%s: %d" % (word, frequency)
答案 0 :(得分:7)
if (len(rawline) == 0):
可以写成
if rawline:
您不应该使用len(rawline) - 1
作为索引,只需使用rawline[-1]
。
我不知道为什么你使用re.split()
,只能做linestip.lower().split('\t')
。
请勿使用dailyMsg.has_key(date)
,请使用date in dailyMsg
。
当您遍历tweetsByDay
时,您应该这样做:
for date, value in tweetsByDay.items():`
这样您就不必手动将值绑定到密钥。
这只是一个开始。还有许多问题需要解决。我认为你真的只需要掌握Python - 通过阅读你的代码可以清楚地看出,Python不是你的第一语言,或者你是从一个没有教你如何写好它的资源中学到的。例如,为什么要在括号内加括号?这在Python中不是必需的(尽管它是来自类似Algol的语言(如C或Java)的工件)。为什么使用dict()
代替{}
?第二种方式最好写一个空字典。您可能会发现this tutorial on idiomatic Python有帮助。
答案 1 :(得分:1)
wordCount()
可以并行运行。由于每条推文并不直接依赖于另一条推文,因此没有理由按顺序迭代列表。将推文列表分成较小的列表,然后在每个子列表上使用一个线程。一旦他们完成了所有子字典的创建,你可以做一些工作将它们全部合并到一个字典中。
修改强>
如何平行对列表求和的示例。您可以更改线程的主体以执行您的任务。
from threading import Thread
numbers = range(1000)
class Sum(Thread):
def __init__(self, numList):
Thread.__init__(self)
self.numList = numList
self.total = 0
def run(self):
for num in self.numList:
self.total += num
numThreads = 7
threads = []
perThread = len(numbers)/numThreads
for i in xrange(numThreads):
start = i*perThread
t = Sum(numbers[start:len(numbers) if i == numThreads-1 else start+perThread])
t.start()
threads.append(t)
grandTotal = 0
for t in threads:
t.join()
grandTotal += t.total
print grandTotal
答案 2 :(得分:0)
此代码包含非优化代码段。
例如,函数的每次调用都需要时间。不要做一些无用的功能调用,你会有空的时间。 一些调用可以用理解列表替换:hashtag,httpTag,... etc
我可以帮助优化,但是:
1 - 我目前没有足够的时间从事这类长期工作
2 - 我们无法优化,因为代码不完整:以下函数在哪里调用? :
readTweets
writeWordFile
dailyMessages
dailyWord
wordCount
searchForMemes
isMeme
dayUserCount
hashtag
httpTag
reply
reTweet
writeToFile
f2count
3 - 我很难回答新登记的民众,他们遇到了严重的问题,并在此之后消失,有时候不会留下任何新闻或评论。如果你打算不这样做,请原谅我
将writeToFile
f2count
显然必须从列表中删除。