在tweepy.StreamListener on_data方法中过滤推文

时间:2016-01-29 02:05:38

标签: filter tweepy tweets

从许多关于堆栈溢出的文章中了解到,tweepy.streaming.stream类中的filter方法对 track location arguements使用逻辑OR

因此,以下内容将返回来自location = USA的推文或带有"""

streamObj = tweepy.streaming.Stream(oauthObject
                              ,EchoStreamListener(api=apiInstance,
                                                 dump_json=args.json,                                                                   
                                                 numtweets=args.numtweets))
keyWordList = ['panthers','falcon']
GEOBOX_USA = [-125,25.1,-60.5,49.1]
streamObj.filter(locations=GEOBOX_USA, track=keyWordList, languages=['en'])

此解决方案(How to add a location filter to tweepy module )检查 on_status 方法中的关键字效果很好,但如果我需要存储整个json变量,我想我必须使用 on_data

所以更改了on_data(如下面的代码所示),但是收到错误:

  File "/Library/Python/2.7/site-packages/tweepy/streaming.py", line 294, in _run
    raise exception
KeyError: 'text' 

- - 编码:utf-8 - -

from types import *
import tweepy
import json
import argparse
import io

class EchoStreamListener(tweepy.StreamListener):
    def __init__(self, api, dump_json=False, numtweets=0):
        self.api = api
        self.dump_json = dump_json
        self.count = 0
        self.limit = int(numtweets)

        super(tweepy.StreamListener, self).__init__()

# def on_status(self, status):
#     if any(keyWord in status.text.lower() for keyWord in keyWordList):
#         print status.text
#
#         self.count+=1
#         return False if self.count == self.limit else True
#     else:
#         return True # Don't kill the stream

    def on_data(self, tweet):
        tweet_data = json.loads(tweet)  # This allows the JSON data be used as a normal dictionary:

        if any(keyWord in tweet_data['text'] for keyWord in keyWordList):
            if self.dump_json:
                print json.dumps(tweet_data)
                saveFile.write(unicode(tweet) + "\n")

                self.count+=1
                return False if self.count == self.limit else True
            else:
                print tweet_data['created_at','name','text'].encode("utf-8").rstrip()

    def on_error(self, status_code):
        print >> sys.stderr, 'Encountered error with status code:', status_code
        return True 

def get_parser():
    parser = argparse.ArgumentParser(add_help=True)
    group = parser.add_mutually_exclusive_group(required=True)

    group.add_argument(
        '-j', '--json',
        action='store_true',
        help='dump each tweet as a json string'
    )
    group.add_argument(
        '-t', '--text',
        dest='json',
        action='store_false',
        help='dump each tweet\'s text'
    )
    parser.add_argument(
        '-n', '--numtweets',
        metavar='numtweets',
        help='set number of tweets to retrieve'
    )
    return parser

if __name__ == '__main__':

    oauthObject = tweepy.OAuthHandler(myconsumer_key, myconsumer_secret)
    oauthObject.set_access_token(myaccess_key,myaccess_secret)

    apiInstance = tweepy.API(oauthObject) 
    parser = get_parser()
    args = parser.parse_args()

    streamObj = tweepy.streaming.Stream(oauthObject
                                       ,EchoStreamListener(api=apiInstance,
                                                           dump_json=args.json,
                                                       numtweets=args.numtweets))

    keyWordList = ['panthers','falcon']
    GEOBOX_USA = [-125,25.1,-60.5,49.1]
    saveFile = io.open('/Users/deepaktanna/raw_tweets.json', 'w', encoding='utf-8')

    streamObj.filter(locations=GEOBOX_USA, languages=['en'])

    saveFile.close()

0 个答案:

没有答案