我有一个示例 JSON 文件,其中包含大约 500 条推文,我想把它们读入一个数据框(data frame)。
JSON 文件中的前三条推文如下(为了符合 Stack Overflow 对链接的限制,网址已被有意改动):
{"id":"tag:search.twitter.com,2005:413500801899044864","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:860787127","link":"httpee://www.twitter.com/JoeGoodman11","displayName":"Joe Goodman","postedTime":"2012-10-04T03:18:54.000Z","image":"httpes://pbs.twimg.com/profile_images/3781305408/372be07ac2b312d35e1426b264891c4f_normal.jpeg","summary":null,"links":[{"href":null,"rel":"me"}],"friendsCount":21,"followersCount":18,"listedCount":0,"statusesCount":177,"twitterTimeZone":null,"verified":false,"utcOffset":null,"preferredUsername":"JoeGoodman11","languages":["en"],"favoritesCount":286},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for Android","link":"httpee://twitter.com/download/android"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/JoeGoodman11/statuses/413500801899044864","body":"Hard at work studying for finals httpee://t.co/0EumsvUCuI","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500801899044864","summary":"Hard at work studying for finals httpee://t.co/0EumsvUCuI","link":"httpee://twitter.com/JoeGoodman11/statuses/413500801899044864","postedTime":"2013-12-19T02:47:28.000Z"},"favoritesCount":0,"location":{"objectType":"place","displayName":"Lowell, MA","name":"Lowell","country_code":"United 
States","twitter_country_code":"US","link":"httpes://api.twitter.com/1.1/geo/id/d6539f049c4d05e8.json","geo":{"type":"Polygon","coordinates":[[[-71.382491,42.607189],[-71.382491,42.66676],[-71.271231,42.66676],[-71.271231,42.607189]]]}},"geo":{"type":"Point","coordinates":[42.6428357,-71.33654]},"twitter_entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[],"media":[{"id":413500801395736576,"id_str":"413500801395736576","indices":[33,55],"media_url":"httpee://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg","media_url_https":"httpes://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg","url":"httpee://t.co/0EumsvUCuI","display_url":"pic.twitter.com/0EumsvUCuI","expanded_url":"httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1","type":"photo","sizes":{"medium":{"w":600,"h":339,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":340,"h":192,"resize":"fit"},"large":{"w":1023,"h":579,"resize":"fit"}}}]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"urls":[{"url":"httpee://t.co/0EumsvUCuI","expanded_url":"httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1","expanded_status":200}],"language":{"value":"en"}}}
{"id":"tag:search.twitter.com,2005:413500803593547776","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:168228121","link":"httpee://www.twitter.com/rvzigvdhiv","displayName":"Razi الرازي Gadhia","postedTime":"2010-07-18T19:28:45.000Z","image":"httpes://pbs.twimg.com/profile_images/412269827399495680/44JZWZPz_normal.jpeg","summary":"Why so serious? \n#2005spellingbeechamp \n#wood","links":[{"href":null,"rel":"me"}],"friendsCount":196,"followersCount":300,"listedCount":0,"statusesCount":4236,"twitterTimeZone":"Eastern Time (US & Canada)","verified":false,"utcOffset":"-18000","preferredUsername":"rvzigvdhiv","languages":["en"],"location":{"objectType":"place","displayName":"ATL"},"favoritesCount":4316},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for iPhone","link":"http://twitter.com/download/iphone"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/rvzigvdhiv/statuses/413500803593547776","body":"@thellymon haha aight homie I'll let you know","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500803593547776","summary":"@thellymon haha aight homie I'll let you know","link":"httpee://twitter.com/rvzigvdhiv/statuses/413500803593547776","postedTime":"2013-12-19T02:47:28.000Z"},"inReplyTo":{"link":"httpee://twitter.com/thellymon/statuses/413500370695229441"},"favoritesCount":0,"twitter_entities":{"hashtags":[],"symbols":[],"urls":[],"user_mentions":[{"screen_name":"thellymon","name":"","id":920010534,"id_str":"920010534","indices":[0,10]}]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"language":{"value":"en"},"profileLocations":[{"objectType":"place","geo":{"type":"point","coordinates":[-84.38798,33.749]},"address":{"country":"United States","countryCode":"US","locality":"Atlanta","region":"Georgia","subRegion":"Fulton County"},"displayName":"Atlanta, Georgia, United 
States"}]}}
{"id":"tag:search.twitter.com,2005:413500803597758464","objectType":"activity","actor":{"objectType":"person","id":"id:twitter.com:394373858","link":"httpee://www.twitter.com/Carly_Horse12","displayName":"Carly Sawyer","postedTime":"2011-10-19T23:56:56.000Z","image":"httpes://pbs.twimg.com/profile_images/378800000497869250/84266ccaf047be0cfbd8aeb73fe88544_normal.jpeg","summary":"Lindy Hopper. Theatre geek. Biology nerd. Christ follower. Creation lover. Dream chaser.","links":[{"href":null,"rel":"me"}],"friendsCount":398,"followersCount":197,"listedCount":1,"statusesCount":3220,"twitterTimeZone":"Quito","verified":false,"utcOffset":"-18000","preferredUsername":"Carly_Horse12","languages":["en"],"location":{"objectType":"place","displayName":"Charlottesville, VA"},"favoritesCount":662},"verb":"post","postedTime":"2013-12-19T02:47:28.000Z","generator":{"displayName":"Twitter for iPhone","link":"httpee://twitter.com/download/iphone"},"provider":{"objectType":"service","displayName":"Twitter","link":"httpee://www.twitter.com"},"link":"httpee://twitter.com/Carly_Horse12/statuses/413500803597758464","body":"And this concludes the yearly screening of \"It's A Wonder Life\" in it's usual fashion with Mom and me in shambles #tears","object":{"objectType":"note","id":"object:search.twitter.com,2005:413500803597758464","summary":"And this concludes the yearly screening of \"It's A Wonder Life\" in it's usual fashion with Mom and me in shambles #tears","link":"httpee://twitter.com/Carly_Horse12/statuses/413500803597758464","postedTime":"2013-12-19T02:47:28.000Z"},"favoritesCount":0,"twitter_entities":{"hashtags":[{"text":"tears","indices":[114,120]}],"symbols":[],"urls":[],"user_mentions":[]},"twitter_filter_level":"medium","twitter_lang":"en","retweetCount":0,"gnip":{"language":{"value":"en"},"profileLocations":[{"objectType":"place","geo":{"type":"point","coordinates":[-78.47668,38.02931]},"address":{"country":"United 
States","countryCode":"US","locality":"Charlottesville","region":"Virginia","subRegion":"City of Charlottesville"},"displayName":"Charlottesville, Virginia, United States"}]}}
我使用以下R脚本:
library(rjson)
library(RCurl)
library(plyr)
# Path to the JSON file of tweets ('*filepath*' is presumably a placeholder
# for the real directory).
raw_data<-('*filepath*/JSON test.json')
# Read every line of the file, glue the lines together with no separator
# (collapse=""), and hand the resulting single string to fromJSON().
data<-fromJSON(paste(readLines(raw_data),collapse=""))
# Print the parsed result.
data
# Take the top-level "body" field of the parsed result.
tweets<-data$body
# Print the extracted value.
tweets
得到的结果如下——我只得到了第一条推文的数据:
data<-fromJSON(paste(readLines(raw_data),collapse=""))
data
$id
[1] "tag:search.twitter.com,2005:413500801899044864"
$objectType
[1] "activity"
$actor
$actor$objectType
[1] "person"
$actor$id
[1] "id:twitter.com:860787127"
$actor$link
[1] "httpee://www.twitter.com/JoeGoodman11"
$actor$displayName
[1] "Joe Goodman"
$actor$postedTime
[1] "2012-10-04T03:18:54.000Z"
$actor$image
[1] "httpes://pbs.twimg.com/profile_images/3781305408/372be07ac2b312d35e1426b264891c4f_normal.jpeg"
$actor$summary
NULL
$actor$links
$actor$links[[1]]
$actor$links[[1]]$href
NULL
$actor$links[[1]]$rel
[1] "me"
$actor$friendsCount
[1] 21
$actor$followersCount
[1] 18
$actor$listedCount
[1] 0
$actor$statusesCount
[1] 177
$actor$twitterTimeZone
NULL
$actor$verified
[1] FALSE
$actor$utcOffset
NULL
$actor$preferredUsername
[1] "JoeGoodman11"
$actor$languages
[1] "en"
$actor$favoritesCount
[1] 286
$verb
[1] "post"
$postedTime
[1] "2013-12-19T02:47:28.000Z"
$generator
$generator$displayName
[1] "Twitter for Android"
$generator$link
[1] "httpee://twitter.com/download/android"
$provider
$provider$objectType
[1] "service"
$provider$displayName
[1] "Twitter"
$provider$link
[1] "httpee://www.twitter.com"
$link
[1] "httpee://twitter.com/JoeGoodman11/statuses/413500801899044864"
$body
[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"
$object
$object$objectType
[1] "note"
$object$id
[1] "object:search.twitter.com,2005:413500801899044864"
$object$summary
[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"
$object$link
[1] "httpee://twitter.com/JoeGoodman11/statuses/413500801899044864"
$object$postedTime
[1] "2013-12-19T02:47:28.000Z"
$favoritesCount
[1] 0
$location
$location$objectType
[1] "place"
$location$displayName
[1] "Lowell, MA"
$location$name
[1] "Lowell"
$location$country_code
[1] "United States"
$location$twitter_country_code
[1] "US"
$location$link
[1] "httpes://api.twitter.com/1.1/geo/id/d6539f049c4d05e8.json"
$location$geo
$location$geo$type
[1] "Polygon"
$location$geo$coordinates
$location$geo$coordinates[[1]]
$location$geo$coordinates[[1]][[1]]
[1] -71.38249 42.60719
$location$geo$coordinates[[1]][[2]]
[1] -71.38249 42.66676
$location$geo$coordinates[[1]][[3]]
[1] -71.27123 42.66676
$location$geo$coordinates[[1]][[4]]
[1] -71.27123 42.60719
$geo
$geo$type
[1] "Point"
$geo$coordinates
[1] 42.64284 -71.33654
$twitter_entities
$twitter_entities$hashtags
list()
$twitter_entities$symbols
list()
$twitter_entities$urls
list()
$twitter_entities$user_mentions
list()
$twitter_entities$media
$twitter_entities$media[[1]]
$twitter_entities$media[[1]]$id
[1] 4.135008e+17
$twitter_entities$media[[1]]$id_str
[1] "413500801395736576"
$twitter_entities$media[[1]]$indices
[1] 33 55
$twitter_entities$media[[1]]$media_url
[1] "httpee://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg"
$twitter_entities$media[[1]]$media_url_https
[1] "httpes://pbs.twimg.com/media/Bb0Myb2IQAAaexg.jpg"
$twitter_entities$media[[1]]$url
[1] "httpee://t.co/0EumsvUCuI"
$twitter_entities$media[[1]]$display_url
[1] "pic.twitter.com/0EumsvUCuI"
$twitter_entities$media[[1]]$expanded_url
[1] "httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1"
$twitter_entities$media[[1]]$type
[1] "photo"
$twitter_entities$media[[1]]$sizes
$twitter_entities$media[[1]]$sizes$medium
$twitter_entities$media[[1]]$sizes$medium$w
[1] 600
$twitter_entities$media[[1]]$sizes$medium$h
[1] 339
$twitter_entities$media[[1]]$sizes$medium$resize
[1] "fit"
$twitter_entities$media[[1]]$sizes$thumb
$twitter_entities$media[[1]]$sizes$thumb$w
[1] 150
$twitter_entities$media[[1]]$sizes$thumb$h
[1] 150
$twitter_entities$media[[1]]$sizes$thumb$resize
[1] "crop"
$twitter_entities$media[[1]]$sizes$small
$twitter_entities$media[[1]]$sizes$small$w
[1] 340
$twitter_entities$media[[1]]$sizes$small$h
[1] 192
$twitter_entities$media[[1]]$sizes$small$resize
[1] "fit"
$twitter_entities$media[[1]]$sizes$large
$twitter_entities$media[[1]]$sizes$large$w
[1] 1023
$twitter_entities$media[[1]]$sizes$large$h
[1] 579
$twitter_entities$media[[1]]$sizes$large$resize
[1] "fit"
$twitter_filter_level
[1] "medium"
$twitter_lang
[1] "en"
$retweetCount
[1] 0
$gnip
$gnip$urls
$gnip$urls[[1]]
$gnip$urls[[1]]$url
[1] "httpee://t.co/0EumsvUCuI"
$gnip$urls[[1]]$expanded_url
[1] "httpee://twitter.com/JoeGoodman11/status/413500801899044864/photo/1"
$gnip$urls[[1]]$expanded_status
[1] 200
$gnip$language
$gnip$language$value
[1] "en"
和
tweets<-data$body
tweets
[1] "Hard at work studying for finals http://t.co/0EumsvUCuI"
我的目标是让 tweets 包含全部 500 条推文的正文(body)字段。非常感谢任何帮助!
答案 0 :(得分:1)
您的 paste 调用只是把各行直接拼接在一起,并没有在它们之间插入正确的 JSON 分隔符。如果改成类似下面这样:
# The file holds one JSON object per line, which is not a valid JSON document
# on its own. Join the lines with commas and wrap them in [ ] so that the
# whole file parses as a single JSON array of objects.
json_lines <- readLines(raw_data, warn = FALSE)
# Drop blank lines: joining them would produce ",," and make the JSON invalid.
json_lines <- json_lines[nzchar(trimws(json_lines))]
data <- fromJSON(sprintf("[%s]", paste(json_lines, collapse = ",")))
那么各行之间会以逗号分隔,并且整体被方括号包裹,从而构成一个合法的 JSON 对象数组。之后,您就可以从 data 数组的每个元素中提取顶层属性:
# Extract the top-level "body" field from every parsed tweet.
# vapply() guarantees a character vector; sapply() would silently fall back
# to a list if any tweet had no "body". Tweets without a body yield NA.
bodies <- vapply(
  data,
  function(tweet) {
    body <- tweet[["body"]]
    if (is.null(body)) NA_character_ else body
  },
  character(1)
)