I am using this wonderful resource to get tweets from Twitter.
Twitter pushes the tweets, and this script grabs them and displays them in the terminal.
I apologize for the beginner question, but how can I limit the number of tweets received with this script, or the total size of all the tweets?
I know I could write the output to a file and watch the file's size, but is there a way to count the number of bytes received or the number of tweets received?
The script is shown below.
#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2012 Gustav Arngården

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import time
import pycurl
import urllib
import json
import oauth2 as oauth

API_ENDPOINT_URL = 'https://stream.twitter.com/1.1/statuses/filter.json'
USER_AGENT = 'TwitterStream 1.0'  # This can be anything really

# You need to replace these with your own values
OAUTH_KEYS = {'consumer_key': <Consumer key>,
              'consumer_secret': <Consumer secret>,
              'access_token_key': <Token key>,
              'access_token_secret': <Token secret>}

# These values are posted when setting up the connection
POST_PARAMS = {'include_entities': 0,
               'stall_warning': 'true',
               'track': 'iphone,ipad,ipod'}


class TwitterStream:
    def __init__(self, timeout=False):
        self.oauth_token = oauth.Token(key=OAUTH_KEYS['access_token_key'], secret=OAUTH_KEYS['access_token_secret'])
        self.oauth_consumer = oauth.Consumer(key=OAUTH_KEYS['consumer_key'], secret=OAUTH_KEYS['consumer_secret'])
        self.conn = None
        self.buffer = ''
        self.timeout = timeout
        self.setup_connection()

    def setup_connection(self):
        """ Create persistent HTTP connection to Streaming API endpoint using cURL.
        """
        if self.conn:
            self.conn.close()
            self.buffer = ''
        self.conn = pycurl.Curl()
        # Restart connection if less than 1 byte/s is received during "timeout" seconds
        if isinstance(self.timeout, int):
            self.conn.setopt(pycurl.LOW_SPEED_LIMIT, 1)
            self.conn.setopt(pycurl.LOW_SPEED_TIME, self.timeout)
        self.conn.setopt(pycurl.URL, API_ENDPOINT_URL)
        self.conn.setopt(pycurl.USERAGENT, USER_AGENT)
        # Using gzip is optional but saves us bandwidth.
        self.conn.setopt(pycurl.ENCODING, 'deflate, gzip')
        self.conn.setopt(pycurl.POST, 1)
        self.conn.setopt(pycurl.POSTFIELDS, urllib.urlencode(POST_PARAMS))
        self.conn.setopt(pycurl.HTTPHEADER, ['Host: stream.twitter.com',
                                             'Authorization: %s' % self.get_oauth_header()])
        # self.handle_tweet is the method that is called when new tweets arrive
        self.conn.setopt(pycurl.WRITEFUNCTION, self.handle_tweet)

    def get_oauth_header(self):
        """ Create and return OAuth header.
        """
        params = {'oauth_version': '1.0',
                  'oauth_nonce': oauth.generate_nonce(),
                  'oauth_timestamp': int(time.time())}
        req = oauth.Request(method='POST', parameters=params, url='%s?%s' % (API_ENDPOINT_URL,
                                                                             urllib.urlencode(POST_PARAMS)))
        req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), self.oauth_consumer, self.oauth_token)
        return req.to_header()['Authorization'].encode('utf-8')

    def start(self):
        """ Start listening to Streaming endpoint.
        Handle exceptions according to Twitter's recommendations.
        """
        backoff_network_error = 0.25
        backoff_http_error = 5
        backoff_rate_limit = 60
        while True:
            self.setup_connection()
            try:
                self.conn.perform()
            except:
                # Network error, use linear back off up to 16 seconds
                print 'Network error: %s' % self.conn.errstr()
                print 'Waiting %s seconds before trying again' % backoff_network_error
                time.sleep(backoff_network_error)
                backoff_network_error = min(backoff_network_error + 1, 16)
                continue
            # HTTP Error
            sc = self.conn.getinfo(pycurl.HTTP_CODE)
            if sc == 420:
                # Rate limit, use exponential back off starting with 1 minute and double each attempt
                print 'Rate limit, waiting %s seconds' % backoff_rate_limit
                time.sleep(backoff_rate_limit)
                backoff_rate_limit *= 2
            else:
                # HTTP error, use exponential back off up to 320 seconds
                print 'HTTP error %s, %s' % (sc, self.conn.errstr())
                print 'Waiting %s seconds' % backoff_http_error
                time.sleep(backoff_http_error)
                backoff_http_error = min(backoff_http_error * 2, 320)

    def handle_tweet(self, data):
        """ This method is called when data is received through Streaming endpoint.
        """
        self.buffer += data
        if data.endswith('\r\n') and self.buffer.strip():
            # complete message received
            message = json.loads(self.buffer)
            self.buffer = ''
            msg = ''
            if message.get('limit'):
                print 'Rate limiting caused us to miss %s tweets' % (message['limit'].get('track'))
            elif message.get('disconnect'):
                raise Exception('Got disconnect: %s' % message['disconnect'].get('reason'))
            elif message.get('warning'):
                print 'Got warning: %s' % message['warning'].get('message')
            else:
                print 'Got tweet with text: %s' % message.get('text')


if __name__ == '__main__':
    ts = TwitterStream()
    ts.setup_connection()
    ts.start()
Answer 0 (score: 2)
Why not keep a counter in __init__:
def __init__(self, timeout=False):
    ...
    self.tweetsSoFar = 0

def handle_tweet(self, data):
    ...
    else:
        self.tweetsSoFar += 1  # we've seen another tweet!
        print 'Got tweet with text: %s' % message.get('text')

def start(self):
    ...
    while self.tweetsSoFar < 50:  # stop at 50 tweets
        ...
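In case it helps to see the pattern in isolation, here is a tiny self-contained toy (no Twitter or pycurl involved): the counter lives on the instance, handle_tweet bumps it, and the loop in start stops at the limit. The class name, the fake messages and the limit of 5 are made up purely for illustration.

class CountingStream(object):
    def __init__(self, limit=5):
        self.limit = limit
        self.tweets_so_far = 0

    def handle_tweet(self, text):
        self.tweets_so_far += 1  # we've seen another tweet
        print 'Got tweet with text: %s' % text

    def start(self, source):
        for text in source:  # stands in for the streaming loop
            if self.tweets_so_far >= self.limit:
                print 'Reached %d tweets, stopping' % self.limit
                break
            self.handle_tweet(text)

if __name__ == '__main__':
    CountingStream(limit=5).start('tweet %d' % i for i in xrange(100))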
Hope this helps!
Answer 1 (score: 1)
I'm not too familiar with the API, but assuming each complete message represents one tweet, what you can do is first add a counter to the class:
self.tweets = 0
Then increment it in handle_tweet once a complete message has been received:
if data.endswith('\r\n') and self.buffer.strip():
    # complete message received
    message = json.loads(self.buffer)
    ...
    ...
    else:
        print 'Got tweet with text: %s' % message.get('text')
        self.tweets += 1
        if self.tweets == SOME_LIMIT:
            self.conn.close()  # or exit()
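Pulling the two answers together, and adding the byte count the question also asks about: below is a sketch that subclasses the TwitterStream class from the question. The class name and the MAX_TWEETS / MAX_BYTES limits are made-up illustrations, and the sketch assumes libcurl's write-callback contract, where returning a byte count smaller than the chunk passed in aborts the transfer, so conn.perform() returns and the loop in start() can re-check the limits.

MAX_TWEETS = 100              # example: stop after this many complete tweets
MAX_BYTES = 5 * 1024 * 1024   # example: or after roughly 5 MB of raw stream data

class LimitedTwitterStream(TwitterStream):
    def __init__(self, timeout=False):
        self.tweet_count = 0
        self.byte_count = 0
        TwitterStream.__init__(self, timeout)

    def handle_tweet(self, data):
        # Count every raw chunk pycurl hands us, then reuse the buffering logic.
        self.byte_count += len(data)
        self.buffer += data
        if data.endswith('\r\n') and self.buffer.strip():
            message = json.loads(self.buffer)
            self.buffer = ''
            # For brevity only ordinary tweets are handled here; keep the
            # limit/disconnect/warning branches from the original handle_tweet.
            if message.get('text'):
                self.tweet_count += 1
                print 'Got tweet with text: %s' % message.get('text')
        if self.tweet_count >= MAX_TWEETS or self.byte_count >= MAX_BYTES:
            # Returning fewer bytes than we were given signals a write error
            # to libcurl, which aborts the transfer so conn.perform() returns.
            return 0

    def start(self):
        # Same idea as answer 0: only keep (re)connecting while under the limits.
        # The back-off/error handling from the original start() is omitted here.
        while self.tweet_count < MAX_TWEETS and self.byte_count < MAX_BYTES:
            self.setup_connection()
            try:
                self.conn.perform()
            except pycurl.error:
                pass  # aborted or dropped transfer; the loop re-checks the limits

You would then run LimitedTwitterStream().start() in place of the original main block, or fold the same changes directly into TwitterStream if you prefer not to subclass.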