Twitter和单行JSON输出

时间:2015-11-01 21:30:36

标签: python json twitter

我正在尝试收集所有(或在有限的Twitter firehose中)提及特定单词的推特,使用Twitter Streaming一天进入JSON文件。这是我的代码:

import re
import twitter
import numpy as np
import pandas as pd
import os
import json
import time
import datetime

q = 'JustinBieber'  #just for the sake of demonstration
max_time = 86400

CONSUMER_KEY = '...'
CONSUMER_SECRET = '...'
OAUTH_TOKEN = '...'
OAUTH_TOKEN_SECRET = '...'

auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

twitter_api = twitter.TwitterStream(auth=auth)

count = 0
max_iter = None

twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)

stream = twitter_stream.statuses.filter(track=q)

statuses = []

start = datetime.datetime.now()
startstr = str(start)
dummy = str(start.month)+str(start.day)+str(start.year)
jsonfile = str(q)+dummy+'.json'

with open(jsonfile, 'w') as f:

    try:
        for tweet in stream:

            json.dump(tweet, f)
            count += 1

            #if count % 100 == 0:  # for Justin Beiber
            if count % 5 == 0:     # for less popular

                print('{0} tweets fetched...'.format(count))

            now = datetime.datetime.now()

            if now - start > datetime.timedelta(0, 120, 0):
                break

            if max_iter is not None and count >= max_iter:
                #f.close()
                break

    except Exception as e:
        print(e)

f.close()

当我这样做时,我得到一个非常长的,单行的JSON文件,看起来像这样(对于不同的搜索词):

{"favorited": false, "contributors": null, "truncated": false, "text": "This Saturday is #Trailfest, 10k and 15m trail running races. Good luc
k to all of the runners participating! http://t.co/pxgPNn432c", "possibly_sensitive": false, "is_quote_status": false, "in_reply_to_status_id"
: null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 26342031, "verified"
: false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "profile_sidebar_fill_colo
r": "B00100", "profile_text_color": "999894", "followers_count": 1414, "profile_sidebar_border_color": "EA3001", "id_str": "26342031", "profil
e_background_color": "010002", "listed_count": 74, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/3539
72397/P1070158_-_Version_2_copy.jpg", "utc_offset": -21600, "statuses_count": 683, "description": "Pajarito Mountain Ski Area is friendly, cha
llenging and  authentic. The perfect place to spend some quality mountain time, close to Santa Fe and ABQ. #pajarito", "friends_count": 244, "
location": "Los Alamos, NM", "profile_link_color": "121111", "profile_image_url": "http://pbs.twimg.com/profile_images/550440833946628096/U5LL
94A0_normal.jpeg", "following": null, "geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/26342031/1428964972",
 "profile_background_image_url": "http://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg", "name": "Pajarito M
ountain", "lang": "en", "profile_background_tile": false, "favourites_count": 30, "screen_name": "SkiPajarito", "notifications": null, "url": 
"http://www.skipajarito.com", "created_at": "Tue Mar 24 22:29:08 +0000 2009", "contributors_enabled": false, "time_zone": "Mountain Time (US &
 Canada)", "protected": false, "default_profile": false, "is_translator": false}, "filter_level": "low", "geo": null, "id": 653676227929210880
, "favorite_count": 0, "lang": "en", "entities": {"user_mentions": [], "symbols": [], "hashtags": [{"indices": [17, 27], "text": "Trailfest"}]
, "urls": [], "media": [{"expanded_url": "http://twitter.com/SkiPajarito/status/653676227929210880/photo/1", "display_url": "pic.twitter.com/p
xgPNn432c", "url": "http://t.co/pxgPNn432c", "media_url_https": "https://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg", "id_str": "6536762276104560
64", "sizes": {"small": {"h": 226, "resize": "fit", "w": 340}, "large": {"h": 681, "resize": "fit", "w": 1024}, "medium": {"h": 399, "resize":
 "fit", "w": 600}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [109, 131], "type": "photo", "id": 653676227610456064, "media_
url": "http://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg"}]}, "in_reply_to_user_id_str": null, "retweeted": false, "coordinates": null, "timestam
p_ms": "1444683532112", "source": "<a href=\"http://www.hootsuite.com\" rel=\"nofollow\">Hootsuite</a>", "in_reply_to_status_id_str": null, "i
n_reply_to_screen_name": null, "id_str": "653676227929210880", "extended_entities": {"media": [{"expanded_url": "http://twitter.com/SkiPajarit
o/status/653676227929210880/photo/1", "display_url": "pic.twitter.com/pxgPNn432c", "url": "http://t.co/pxgPNn432c", "media_url_https": "https:
//pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg", "id_str": "653676227610456064", "sizes": {"small": {"h": 226, "resize": "fit", "w": 340}, "large":
 {"h": 681, "resize": "fit", "w": 1024}, "medium": {"h": 399, "resize": "fit", "w": 600}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "i
ndices": [109, 131], "type": "photo", "id": 653676227610456064, "media_url": "http://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg"}]}, "place": nul
l, "retweet_count": 0, "created_at": "Mon Oct 12 20:58:52 +0000 2015", "in_reply_to_user_id": null}{"favorited": false, "contributors": null, 
"truncated": false, "text": "Sleep with a spoon? Pray to Ullr? Which of these rituals do you partake in? http://t.co/KCXMfsR318 @PowderMagazin
e http://t.co/JElQ95Qr6R", "possibly_sensitive": false, "is_quote_status": false, "in_reply_to_status_id": null, "user": {"follow_request_sent
": null, "profile_use_background_image": true, "default_profile_image": false, "id": 26342031, "verified": false, "profile_image_url_https": "
https://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "profile_sidebar_fill_color": "B00100", "profile_text_color": "
999894", "followers_count": 1417, "profile_sidebar_border_color": "EA3001", "id_str": "26342031", "profile_background_color": "010002", "liste
d_count": 74, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg",
 "utc_offset": -21600, "statuses_count": 684, "description": "Pajarito Mountain Ski Area is friendly, challenging and  authentic. The perfect 
place to spend some quality mountain time, close to Santa Fe and ABQ. #pajarito", "friends_count": 244, "location": "Los Alamos, NM", "profile
_link_color": "121111", "profile_image_url": "http://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "following": null,
 "geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/26342031/1428964972", "profile_background_image_url": "htt
p://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg", "name": "Pajarito Mountain", "lang": "en", "profile_back
ground_tile": false, "favourites_count": 30, "screen_name": "SkiPajarito", "notifications": null,...

我试图以各种方式解析它,例如:

with open(filename, 'rb') as f:
    data = f.readlines()
data = map(lambda x: x.rstrip(), data)
data_json_str = "[" + ','.join(data) + "]"
data_df = pd.read_json(data_json_str)

这让我错误:

JSONDecodeError:额外数据:第1行第3792行 - 第1行第69900行(字符3791 - 69899)

对我可能做错了什么的想法?

1 个答案:

答案 0 :(得分:0)

您可以尝试这样做:

file = './Sample-2-Tweets.json'
    count = 0
    text  = ""
    if file != None:
        with open(file) as data_file:
            for row in data_file:`enter code here`
                data = json.loads(row)
                count = count + 1
                text = text + "\nTweet Created at: " + data['createdAt']['$date'] + "\nGeo-Location Latitude: " + str(data['geoLocation']['latitude']) + "\nGeo-Location Longitude: " + str(data['geoLocation']['longitude']) + "\nTweet Text " + data['text'] + "\nPlace Name: " + data['place']['name'] + "\nPlace Full Name: " + data['place']['fullName'] + "\n\n...... Next Tweet ........"
        scr.insert(tk.INSERT, text)
        print("Counter " + str(count))