我使用免费的1%流将大量的Twitter数据流式传输到S3存储桶中,然后将其下载以进行某些分析。
我注意到当我使用www.jsonlint.com时,我的一些JSON数据是无效的JSON,经过一些挖掘后,我发现这是因为有些推文并没有全部用换行符分隔。
有人能为我指出解决这个问题的正确方向吗?我的想法是遍历每个文件,检查推文之间是否有换行符(我记得Twitter使用的是\r\n,对吗?)。如果没有,我就必须把它补上......
另外,出现这种情况的原因是什么?是不是我的Streamer代码有问题(它是一个负责收集所有数据的node.js脚本)?
这是一个示例数据集(只是一些导致同样问题的代表性推文)......:http://pastebin.com/8AjM6yc2
以下是一些未能添加到列表中的示例数据:
{"created_at":"Sun Sep 18 23:58:50 +0000 2016","id":777658170751582200,"id_str":"777658170751582208","text":"nobody tell him i like the packers okay? @ University of Phoenix Stadium www.example.com","source":"<a href=\"http://instagram.com\" rel=\"nofollow\">Instagram</a>","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":35321729,"id_str":"35321729","name":"sammy","screen_name":"guzzzyy","location":"university of arizona '19","url":null,"description":"loves otters and conspiracy theories | ΣK","protected":false,"verified":false,"followers_count":620,"friends_count":306,"listed_count":4,"favourites_count":15824,"statuses_count":20940,"created_at":"Sat Apr 25 21:58:01 +0000 2009","utc_offset":-25200,"time_zone":"Arizona","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FFFFFF","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/447840344499974144/A8FRdFXz.png","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/447840344499974144/A8FRdFXz.png","profile_background_tile":true,"profile_link_color":"DCBBFA","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"ED0043","profile_text_color":"FFFFFF","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/768288586886029315/h5-HBL5y_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/768288586886029315/h5-HBL5y_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/35321729/1473271290","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[33.52812869,-112.26250073]},"coordinates":{"type":"Point","coordinates":[-112.26250073,33.52812869]},"place":{"id":"a612c69b44b2e5da","url":"http
s://api.twitter.com/1.1/geo/id/a612c69b44b2e5da.json","place_type":"admin","name":"Arizona","full_name":"Arizona, USA","country_code":"US","country":"United States","bounding_box":{"type":"Polygon","coordinates":[[[-114.818269,31.332246],[-114.818269,37.004261],[-109.045153,37.004261],[-109.045153,31.332246]]]},"attributes":{}},"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"www.example.com","expanded_url":"https://www.instagram.com/p/BKhD2tfhOCK/","display_url":"instagram.com/p/BKhD2tfhOCK/","indices":[73,96]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1474243130754"},
{"created_at":"Sun Sep 18 23:58:50 +0000 2016","id":777658171657691100,"id_str":"777658171657691136","text":"Pastor @pastormurph & @zebonperiscope\n@ChangingAGenAtl @ Changing A Generation Full Gospel… www.example.com","source":"<a href=\"http://instagram.com\" rel=\"nofollow\">Instagram</a>","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":157062565,"id_str":"157062565","name":"Lady Tyisha Phillips","screen_name":"TyishaPhillips","location":"Atlanta, GA","url":"http://www.cagmin.org","description":"Young Adult Ministry Co-Pastor","protected":false,"verified":false,"followers_count":643,"friends_count":609,"listed_count":4,"favourites_count":163,"statuses_count":1752,"created_at":"Fri Jun 18 19:02:11 +0000 2010","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"BDB9BD","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/378800000149695120/SnBdvtk3.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/378800000149695120/SnBdvtk3.jpeg","profile_background_tile":true,"profile_link_color":"990000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/666443040052142080/6huKB94N_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/666443040052142080/6huKB94N_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/157062565/1386911172","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[33.68266,-84.49847]},"coordinates":{"type":"Point","coordinates":[-84.49847,33.68266]},"place":{"id":"8173485c72e7
8ca5","url":"https://api.twitter.com/1.1/geo/id/8173485c72e78ca5.json","place_type":"city","name":"Atlanta","full_name":"Atlanta, GA","country_code":"US","country":"United States","bounding_box":{"type":"Polygon","coordinates":[[[-84.576827,33.647503],[-84.576827,33.886886],[-84.289385,33.886886],[-84.289385,33.647503]]]},"attributes":{}},"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"www.example.com","expanded_url":"https://www.instagram.com/p/BKhD1gHAwKL/","display_url":"instagram.com/p/BKhD1gHAwKL/","indices":[96,119]}],"user_mentions":[{"screen_name":"pastormurph","name":"William Murphy","id":21972711,"id_str":"21972711","indices":[7,19]},{"screen_name":"ZEBonPeriscope","name":"Zebulon Ellis","id":3327209906,"id_str":"3327209906","indices":[26,41]},{"screen_name":"CHANGINGAGENATL","name":"CAGFGBC ATL","id":234880261,"id_str":"234880261","indices":[42,58]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1474243130970"},
{"created_at":"Sun Sep 18 23:58:51 +0000 2016","id":777658172081176600,"id_str":"777658172081176576","text":" @ Villas de la Boca www.example.com","source":"<a href=\"http://instagram.com\" rel=\"nofollow\">Instagram</a>","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":879232754,"id_str":"879232754","name":"Tadeo Martinez","screen_name":"LuizTigres","location":"Instagram","url":"http://Instagram.com/luiztadeo","description":"Hincha de Tigres ⚽️ en las buenas te quiero , en las malas Teamo! Snap:tadeo.mtz","protected":false,"verified":false,"followers_count":148,"friends_count":134,"listed_count":1,"favourites_count":686,"statuses_count":1987,"created_at":"Sun Oct 14 03:31:55 +0000 2012","utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/378800000117839880/ae0d0e92e0b9ff5c5b9184636d7a8220.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/378800000117839880/ae0d0e92e0b9ff5c5b9184636d7a8220.jpeg","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/757612542084599808/DElVfT1O_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/757612542084599808/DElVfT1O_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/879232754/1464846885","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[25.4462465,-100.09724682]},"coordinates":{"type":"Point","coordin
ates":[-100.09724682,25.4462465]},"place":{"id":"2c05f2ee0a17497d","url":"https://api.twitter.com/1.1/geo/id/2c05f2ee0a17497d.json","place_type":"city","name":"Santiago","full_name":"Santiago, Nuevo León","country_code":"MX","country":"México","bounding_box":{"type":"Polygon","coordinates":[[[-100.530034,25.228247],[-100.530034,25.521547],[-100.028913,25.521547],[-100.028913,25.228247]]]},"attributes":{}},"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"www.example.com","expanded_url":"https://www.instagram.com/p/BKhD2xWhwdMPu8FAx3zoZXoqrgTd-ZrksAR76E0/","display_url":"instagram.com/p/BKhD2xWhwdMP…","indices":[22,45]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"es","timestamp_ms":"1474243131071"},
{"created_at":"Sun Sep 18 23:58:51 +0000 2016","id":777658172458827800,"id_str":"777658172458827776","text":"There's nothing better than finding someone to serve Jesus with ❤️ |… www.example.com","source":"<a href=\"http://instagram.com\" rel=\"nofollow\">Instagram</a>","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":204172559,"id_str":"204172559","name":"Amanda Gutierrez ❁","screen_name":"agutierrez910","location":null,"url":"http://newcreationca.org","description":"Worship Pastor at New Creation Church, Wife to @willspeaks, mommy of 3 boys, & child of an AMAZING God! #winning","protected":false,"verified":false,"followers_count":519,"friends_count":272,"listed_count":10,"favourites_count":1969,"statuses_count":3823,"created_at":"Mon Oct 18 02:45:16 +0000 2010","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"ACDED6","profile_background_image_url":"http://abs.twimg.com/images/themes/theme18/bg.gif","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme18/bg.gif","profile_background_tile":false,"profile_link_color":"038543","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"F6F6F6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/771235854714929152/PD_xu4Od_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/771235854714929152/PD_xu4Od_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/204172559/1472711595","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":{"type":"Point","coordinates":[33.92489,-116.88952]},"coordinates":{"type":"Point","coordinates":[-116.88952,33.92489]},"place
":{"id":"792551bc9bd3c992","url":"https://api.twitter.com/1.1/geo/id/792551bc9bd3c992.json","place_type":"city","name":"Banning","full_name":"Banning, CA","country_code":"US","country":"United States","bounding_box":{"type":"Polygon","coordinates":[[[-116.947005,33.902607],[-116.947005,33.94771],[-116.859016,33.94771],[-116.859016,33.902607]]]},"attributes":{}},"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"www.example.com","expanded_url":"https://www.instagram.com/p/BKhD22EAPT04KMmkxbg5yAGTNFsGdty870jAM80/","display_url":"instagram.com/p/BKhD22EAPT04…","indices":[70,93]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1474243131161"},
我使用的代码:
from json import JSONDecoder
from functools import partial
import os
import json
import io
import csv
def json_parse(file, decoder=JSONDecoder(), buffersize=2048):
    """Yield each JSON document found in a stream of concatenated documents.

    The input is read in chunks of ``buffersize`` characters, so a single
    document may span several reads. Documents may follow one another
    directly or be separated by any mix of whitespace (spaces, tabs, CR, LF)
    and commas; those separators are skipped before each decode attempt.

    Args:
        file: Text-mode file-like object whose ``read(n)`` returns ``str``
            and returns ``''`` at end of input.
        decoder: ``json.JSONDecoder`` used to decode each document.
        buffersize: Number of characters requested per ``read`` call.

    Yields:
        The decoded Python value of each complete document, in stream order.

    Note:
        A decode failure is treated as "document not complete yet": the
        pending text is kept and retried after the next read. Text that
        still cannot be decoded when the input ends (e.g. a truncated or
        malformed document, and everything after it) is dropped silently.
    """
    # Characters allowed between two top-level documents. ``raw_decode``
    # does not skip leading whitespace itself, and it rejects a leading
    # comma, so without this strip only the first document of a
    # delimiter-separated stream would ever be decoded.
    separators = ' \t\r\n,'
    buffer = ''
    for chunk in iter(partial(file.read, buffersize), ''):
        buffer += chunk
        while True:
            buffer = buffer.lstrip(separators)
            if not buffer:
                break
            try:
                result, index = decoder.raw_decode(buffer)
            except ValueError:
                # Incomplete document: wait for the next chunk.
                break
            yield result
            # Drop the consumed document; keep whatever follows it.
            buffer = buffer[index:]
# Collect every decoded tweet from the files in the current working
# directory whose names start with "twitter", then report how many
# documents were loaded in total.
dataset = []
for entry in os.listdir():
    if not entry.startswith("twitter"):
        continue
    with open(entry, mode='r', encoding='utf-8') as infh:
        # json_parse is a generator; extend() drains it in stream order.
        dataset.extend(json_parse(infh))
print(len(dataset))