我正在尝试收集所有(或在有限的Twitter firehose中)提及特定单词的推特,使用Twitter Streaming一天进入JSON文件。这是我的代码:
import re
import twitter
import numpy as np
import pandas as pd
import os
import json
import time
import datetime
q = 'JustinBieber' #just for the sake of demonstration
max_time = 86400
CONSUMER_KEY = '...'
CONSUMER_SECRET = '...'
OAUTH_TOKEN = '...'
OAUTH_TOKEN_SECRET = '...'
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
twitter_api = twitter.TwitterStream(auth=auth)
count = 0
max_iter = None
twitter_stream = twitter.TwitterStream(auth=twitter_api.auth)
stream = twitter_stream.statuses.filter(track=q)
statuses = []
start = datetime.datetime.now()
startstr = str(start)
dummy = str(start.month)+str(start.day)+str(start.year)
jsonfile = str(q)+dummy+'.json'
with open(jsonfile, 'w') as f:
try:
for tweet in stream:
json.dump(tweet, f)
count += 1
#if count % 100 == 0: # for Justin Beiber
if count % 5 == 0: # for less popular
print('{0} tweets fetched...'.format(count))
now = datetime.datetime.now()
if now - start > datetime.timedelta(0, 120, 0):
break
if max_iter is not None and count >= max_iter:
#f.close()
break
except Exception as e:
print(e)
f.close()
当我这样做时,我得到一个非常长的,单行的JSON文件,看起来像这样(对于不同的搜索词):
{"favorited": false, "contributors": null, "truncated": false, "text": "This Saturday is #Trailfest, 10k and 15m trail running races. Good luc
k to all of the runners participating! http://t.co/pxgPNn432c", "possibly_sensitive": false, "is_quote_status": false, "in_reply_to_status_id"
: null, "user": {"follow_request_sent": null, "profile_use_background_image": true, "default_profile_image": false, "id": 26342031, "verified"
: false, "profile_image_url_https": "https://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "profile_sidebar_fill_colo
r": "B00100", "profile_text_color": "999894", "followers_count": 1414, "profile_sidebar_border_color": "EA3001", "id_str": "26342031", "profil
e_background_color": "010002", "listed_count": 74, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/3539
72397/P1070158_-_Version_2_copy.jpg", "utc_offset": -21600, "statuses_count": 683, "description": "Pajarito Mountain Ski Area is friendly, cha
llenging and authentic. The perfect place to spend some quality mountain time, close to Santa Fe and ABQ. #pajarito", "friends_count": 244, "
location": "Los Alamos, NM", "profile_link_color": "121111", "profile_image_url": "http://pbs.twimg.com/profile_images/550440833946628096/U5LL
94A0_normal.jpeg", "following": null, "geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/26342031/1428964972",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg", "name": "Pajarito M
ountain", "lang": "en", "profile_background_tile": false, "favourites_count": 30, "screen_name": "SkiPajarito", "notifications": null, "url":
"http://www.skipajarito.com", "created_at": "Tue Mar 24 22:29:08 +0000 2009", "contributors_enabled": false, "time_zone": "Mountain Time (US &
Canada)", "protected": false, "default_profile": false, "is_translator": false}, "filter_level": "low", "geo": null, "id": 653676227929210880
, "favorite_count": 0, "lang": "en", "entities": {"user_mentions": [], "symbols": [], "hashtags": [{"indices": [17, 27], "text": "Trailfest"}]
, "urls": [], "media": [{"expanded_url": "http://twitter.com/SkiPajarito/status/653676227929210880/photo/1", "display_url": "pic.twitter.com/p
xgPNn432c", "url": "http://t.co/pxgPNn432c", "media_url_https": "https://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg", "id_str": "6536762276104560
64", "sizes": {"small": {"h": 226, "resize": "fit", "w": 340}, "large": {"h": 681, "resize": "fit", "w": 1024}, "medium": {"h": 399, "resize":
"fit", "w": 600}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "indices": [109, 131], "type": "photo", "id": 653676227610456064, "media_
url": "http://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg"}]}, "in_reply_to_user_id_str": null, "retweeted": false, "coordinates": null, "timestam
p_ms": "1444683532112", "source": "<a href=\"http://www.hootsuite.com\" rel=\"nofollow\">Hootsuite</a>", "in_reply_to_status_id_str": null, "i
n_reply_to_screen_name": null, "id_str": "653676227929210880", "extended_entities": {"media": [{"expanded_url": "http://twitter.com/SkiPajarit
o/status/653676227929210880/photo/1", "display_url": "pic.twitter.com/pxgPNn432c", "url": "http://t.co/pxgPNn432c", "media_url_https": "https:
//pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg", "id_str": "653676227610456064", "sizes": {"small": {"h": 226, "resize": "fit", "w": 340}, "large":
{"h": 681, "resize": "fit", "w": 1024}, "medium": {"h": 399, "resize": "fit", "w": 600}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "i
ndices": [109, 131], "type": "photo", "id": 653676227610456064, "media_url": "http://pbs.twimg.com/media/CRJTENDWsAAy3Eg.jpg"}]}, "place": nul
l, "retweet_count": 0, "created_at": "Mon Oct 12 20:58:52 +0000 2015", "in_reply_to_user_id": null}{"favorited": false, "contributors": null,
"truncated": false, "text": "Sleep with a spoon? Pray to Ullr? Which of these rituals do you partake in? http://t.co/KCXMfsR318 @PowderMagazin
e http://t.co/JElQ95Qr6R", "possibly_sensitive": false, "is_quote_status": false, "in_reply_to_status_id": null, "user": {"follow_request_sent
": null, "profile_use_background_image": true, "default_profile_image": false, "id": 26342031, "verified": false, "profile_image_url_https": "
https://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "profile_sidebar_fill_color": "B00100", "profile_text_color": "
999894", "followers_count": 1417, "profile_sidebar_border_color": "EA3001", "id_str": "26342031", "profile_background_color": "010002", "liste
d_count": 74, "profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg",
"utc_offset": -21600, "statuses_count": 684, "description": "Pajarito Mountain Ski Area is friendly, challenging and authentic. The perfect
place to spend some quality mountain time, close to Santa Fe and ABQ. #pajarito", "friends_count": 244, "location": "Los Alamos, NM", "profile
_link_color": "121111", "profile_image_url": "http://pbs.twimg.com/profile_images/550440833946628096/U5LL94A0_normal.jpeg", "following": null,
"geo_enabled": false, "profile_banner_url": "https://pbs.twimg.com/profile_banners/26342031/1428964972", "profile_background_image_url": "htt
p://pbs.twimg.com/profile_background_images/353972397/P1070158_-_Version_2_copy.jpg", "name": "Pajarito Mountain", "lang": "en", "profile_back
ground_tile": false, "favourites_count": 30, "screen_name": "SkiPajarito", "notifications": null,...
我试图以各种方式解析它,例如:
with open(filename, 'rb') as f:
data = f.readlines()
data = map(lambda x: x.rstrip(), data)
data_json_str = "[" + ','.join(data) + "]"
data_df = pd.read_json(data_json_str)
这让我错误:
JSONDecodeError:额外数据:第1行第3792行 - 第1行第69900行(字符3791 - 69899)
对我可能做错了什么的想法?
答案 0 :(得分:0)
您可以尝试这样做:
file = './Sample-2-Tweets.json'
count = 0
text = ""
if file != None:
with open(file) as data_file:
for row in data_file:`enter code here`
data = json.loads(row)
count = count + 1
text = text + "\nTweet Created at: " + data['createdAt']['$date'] + "\nGeo-Location Latitude: " + str(data['geoLocation']['latitude']) + "\nGeo-Location Longitude: " + str(data['geoLocation']['longitude']) + "\nTweet Text " + data['text'] + "\nPlace Name: " + data['place']['name'] + "\nPlace Full Name: " + data['place']['fullName'] + "\n\n...... Next Tweet ........"
scr.insert(tk.INSERT, text)
print("Counter " + str(count))