Question

我正在观看https://www.youtube.com/watch?v=wlnx-7cm4Gg&list=PL5tcWHG-UPH2zBfOz40HSzcGUPAVOOnu1系列，这是关于使用tweepy（python）挖掘推文，并且该人存储了包含所有内容（例如created_at，id，id_str，text）的推文，然后他在熊猫中使用了数据帧仅存储文本。这样有效吗？如何仅将“文本”存储在Json文件中，而不存储所有其他详细信息？

代码：

ACCESS_TOKEN = "xxxxxxxxxxxxxxxxxxxxx"
ACCESS_TOKEN_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
CONSUMER_SECRET = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

import tweepy
import numpy as np
import pandas as pd
# import twitter_credentials

class TwitterAuthenticator():
    def authenticate_twitter_app(self):
        auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
        auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
        return auth

class TwitterStreamer():
    """
        Class for streaming and processing live tweets.
    """
    def __init__(self):
        self.twitter_authenticator = TwitterAuthenticator()
    def stream_tweets(self, fetched_tweets_filename, hash_tag):
        # This handles Twitter authetification and the connection to Twitter Streaming API
        listener = TwitterListener(fetched_tweets_filename)
        auth = self.twitter_authenticator.authenticate_twitter_app()
        # api = tweepy.API(auth)


        stream = tweepy.Stream(auth,listener)
        stream.filter(track = hash_tag)


class TwitterListener(tweepy.StreamListener):
    """
    This is a basic listener class that just prints received tweets to stdout.
    """

    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data %s" % str(e))
        return True

    def on_status(self, status):
        print(status)

    def on_error(self, status):
        if status == 420:
            # Returning False on_data method in case rate limit occurs.
            return False
        print(status)


# public_tweets = api.home_timeline()
# for tweet in public_tweets:
#     print tweet.text

if __name__ == '__main__':
    hash_tag = ["python"]
    fetched_tweets_filename = "tweets.json"

    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename,hash_tag)

    # print stream.text

存储在json文件中的推文：

{"created_at":"Sun Nov 04 18:43:59 +0000 2018","id":1059154305498972160,"id_str":"1059154305498972160","text":"RT @hmason: When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn h\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":14858491,"id_str":"14858491","name":"Alexandra Lemus","screen_name":"nankyoku","location":"M\u00e9xico","url":null,"description":"Transitioning into the Permanent Beta state...","translator_type":"none","protected":false,"verified":false,"followers_count":173,"friends_count":585,"listed_count":18,"favourites_count":658,"statuses_count":572,"created_at":"Wed May 21 16:35:49 +0000 2008","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"EDECE9","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme3\/bg.gif","profile_background_tile":false,"profile_link_color":"088253","profile_sidebar_border_color":"D3D2CF","profile_sidebar_fill_color":"E3E2DE","profile_text_color":"634047","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/378800000575875952\/f00390453684dd243d7ca95c69a05f74_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/14858491\/1381524599","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sat Nov 03 17:36:24 +0000 2018","id":1058774912201035776,"id_str":"1058774912201035776","text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yoursel\u2026 https:\/\/t.co\/9F7SmlGfyf","source":"\u003ca href=\"http:\/\/twitter.com\" rel=\"nofollow\"\u003eTwitter Web Client\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":765548,"id_str":"765548","name":"Hilary Mason","screen_name":"hmason","location":"NYC","url":"http:\/\/www.hilarymason.com","description":"GM for Machine Learning at @Cloudera. Founder at @FastForwardLabs. Data Scientist in Residence at @accel. I \u2665 data and cheeseburgers.","translator_type":"none","protected":false,"verified":true,"followers_count":111311,"friends_count":1539,"listed_count":5276,"favourites_count":12049,"statuses_count":17602,"created_at":"Sun Feb 11 21:22:24 +0000 2007","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"282F8A","profile_sidebar_border_color":"87BC44","profile_sidebar_fill_color":"AB892B","profile_text_color":"000000","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/948689418709323777\/sTBM3vG0_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/765548\/1353033581","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"When you want to use a new algorithm that you don't deeply understand, the best approach is to implement it yourself to learn how it works, and then use a library to benefit from robust code.\n\nHere's one article showing this with neural networks in Python: https:\/\/t.co\/3ehO86NFKI","display_text_range":[0,280],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/3ehO86NFKI","expanded_url":"https:\/\/towardsdatascience.com\/how-to-build-your-own-neural-network-from-scratch-in-python-68998a08e4f6","display_url":"towardsdatascience.com\/how-to-build-y\u2026","indices":[257,280]}],"user_mentions":[],"symbols":[]}},"quote_count":14,"reply_count":8,"retweet_count":290,"favorite_count":1019,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/9F7SmlGfyf","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1058774912201035776","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"hmason","name":"Hilary Mason","id":765548,"id_str":"765548","indices":[3,10]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1541357039223"}

如果问题不清楚，请注释掉，我将尝试编辑问题。

Answer 1

看起来您从API中获得的并存储在变量“数据”中的是json格式的unicode文本。您只是将文本直接写到文件中。使用您执行的API调用，您将始终获取所有数据，因此效率不是那么低。如果您只想获取/编写推文的文本，请尝试使用json load，然后从那里进行处理。

Answer 2

如果只希望将“文本”字段保存在json文件中，则可以调整TwitterListener.on_data方法的定义：

import json

def on_data(self, data):
    try:
        print(data)
        with open(self.fetched_tweets_filename, 'a') as tf:
            json_load = json.loads(data)
            text = {'text': json_load['text']}
            tf.write(json.dumps(text))
        return True
    except BaseException as e:
        print("Error on_data %s" % str(e))
    return True

一般警告，我没有安装/设置tweepy，所以我只能使用您在上面发布的json文件测试上述代码的版本。如果您遇到任何错误，请告诉我，我会解决的。

如何使用Tweepy仅存储推文的文本

2 个答案: