我一直在使用 R 的 streamR 包中的 parseTweets 函数,效果很好。但是在检查 readTweets 函数"读取"到的推文时,我发现有一些变量没有被 parseTweets 解析(例如用户的背景颜色 profile_background_color)。
查看源代码后,我觉得把这些被忽略的变量加进去应该很简单;但是当我这样做时,构建数据框的那一步似乎跳过了这些"新"变量。我已经单独调试过这些变量,它们本身确实能正确取到值。有人能看出我遗漏了什么吗?
# Parse tweets into a data frame with one row per tweet, including the
# user "profile_*" fields that the stock parseTweets() leaves out.
#
# Arguments:
#   tweets   - passed unchanged to readTweets() (whatever that function accepts).
#   simplify - when FALSE (default), place, geo and URL-entity columns are added.
#   verbose  - when TRUE (default), print the number of parsed tweets.
#
# Returns:
#   A data.frame of tweet-level and user-level fields. A field that is absent
#   from a tweet is NA in that row.
#
# Stops with an error when readTweets() returns no tweets.
parseTweet_more <- function(tweets, simplify=FALSE, verbose=TRUE){
  ## from json to list
  results.list <- readTweets(tweets, verbose = FALSE)

  # nothing to build a data frame from: fail early with a readable message
  if (length(results.list) == 0) {
    stop(deparse(substitute(tweets)), " did not contain any tweets. ",
         "See ?parseTweets for more details.")
  }

  # shorthand: one value per tweet for the given (possibly nested) field path
  pick <- function(path) unlistWithNA(results.list, path)

  # element-wise mean of two equally long vectors, ignoring NAs
  # (yields NaN where both values are missing)
  mean_pair <- function(a, b) {
    vapply(seq_along(a),
           function(i) mean(c(a[i], b[i]), na.rm = TRUE),
           numeric(1))
  }

  # constructing data frame with tweet and user variables
  df <- data.frame(
    text = pick("text"),
    screen_name = pick(c("user", "screen_name")),
    retweet_count = pick(c("retweeted_status", "retweet_count")),
    favorited = pick("favorited"),
    truncated = pick("truncated"),
    id_str = pick("id_str"),
    in_reply_to_screen_name = pick("in_reply_to_screen_name"),
    source = pick("source"),
    retweeted = pick("retweeted"),
    created_at = pick("created_at"),
    in_reply_to_status_id_str = pick("in_reply_to_status_id_str"),
    in_reply_to_user_id_str = pick("in_reply_to_user_id_str"),
    lang = pick("lang"),
    listed_count = pick(c("user", "listed_count")),
    verified = pick(c("user", "verified")),
    location = pick(c("user", "location")),
    user_id_str = pick(c("user", "id_str")),
    description = pick(c("user", "description")),
    geo_enabled = pick(c("user", "geo_enabled")),
    user_created_at = pick(c("user", "created_at")),
    statuses_count = pick(c("user", "statuses_count")),
    followers_count = pick(c("user", "followers_count")),
    favourites_count = pick(c("user", "favourites_count")),
    protected = pick(c("user", "protected")),
    user_url = pick(c("user", "url")),
    name = pick(c("user", "name")),
    time_zone = pick(c("user", "time_zone")),
    user_lang = pick(c("user", "lang")),
    utc_offset = pick(c("user", "utc_offset")),
    # column is called following_count but is read from the user's friends_count
    following_count = pick(c("user", "friends_count")),
    is_translation_enabled = pick(c("user", "is_translation_enabled")),
    # additional user profile fields; every call must be closed with its own
    # ")" or the remaining arguments are swallowed by the unfinished call
    profile_background_color = pick(c("user", "profile_background_color")),
    profile_image_url = pick(c("user", "profile_image_url")),
    profile_link_color = pick(c("user", "profile_link_color")),
    profile_sidebar_border_color = pick(c("user", "profile_sidebar_border_color")),
    profile_sidebar_fill_color = pick(c("user", "profile_sidebar_fill_color")),
    profile_text_color = pick(c("user", "profile_text_color")),
    following = pick(c("user", "following")),
    stringsAsFactors = FALSE)

  # retweet_count is extracted from retweeted_status. If this is not a RT, set to zero
  df$retweet_count[is.na(df$retweet_count)] <- 0

  # adding geographic variables and url entities
  if (simplify == FALSE) {
    df$country_code <- pick(c("place", "country_code"))
    df$country <- pick(c("place", "country"))
    df$place_type <- pick(c("place", "place_type"))
    df$full_name <- pick(c("place", "full_name"))
    df$place_name <- pick(c("place", "place_name"))
    df$place_id <- pick(c("place", "place_id"))

    # place latitude / longitude: mean of two corners of the bounding box
    place_lat_1 <- pick(c("place", "bounding_box", "coordinates", 1, 1, 2))
    place_lat_2 <- pick(c("place", "bounding_box", "coordinates", 1, 2, 2))
    df$place_lat <- mean_pair(place_lat_1, place_lat_2)
    place_lon_1 <- pick(c("place", "bounding_box", "coordinates", 1, 1, 1))
    place_lon_2 <- pick(c("place", "bounding_box", "coordinates", 1, 3, 1))
    df$place_lon <- mean_pair(place_lon_1, place_lon_2)

    df$lat <- pick(c("geo", "coordinates", 1))
    df$lon <- pick(c("geo", "coordinates", 2))
    df$expanded_url <- pick(c("entities", "urls", 1, "expanded_url"))
    df$url <- pick(c("entities", "urls", 1, "url"))
  }

  # information message
  if (verbose == TRUE) cat(length(df$text), "tweets have been parsed.", "\n")
  return(df)
}
# Extract one (possibly nested) field from every element of a list of parsed
# tweets, returning a vector with NA wherever the field is absent.
#
# Arguments:
#   lst   - list of tweets, each itself a nested list.
#   field - character vector giving the path to the value. Numeric positions
#           inside the path arrive as strings (c("geo", "coordinates", 1)
#           is coerced to character) and are converted back where needed.
#
# Supported path shapes (exactly one is applied):
#   * 1 to 3 names                         - plain nested lookup
#   * c("geo", <name>, <position>)         - positional lookup inside geo
#   * c(<name>, "urls", <position>, <name>)
#   * any other 4 names                    - plain nested lookup
#   * c(<name>, "bounding_box", <name>, <pos>, <pos>, <pos>)
#
# Returns:
#   A vector of length(lst); its type is whatever unlist() produces for the
#   values that were found (logical NA when none were found).
#
# Stops with an error for an empty path or a path shape not listed above.
unlistWithNA <- function(lst, field){
  depth <- length(field)
  if (depth == 0) {
    stop("`field` must contain at least one key.", call. = FALSE)
  }

  # follow `keys` (a list of names and/or positions) down a nested list;
  # indexing NULL yields NULL, so a missing intermediate level is harmless
  dig <- function(node, keys) {
    for (key in keys) {
      node <- node[[key]]
    }
    node
  }

  # the path as a list of keys, with the given positions turned back into numbers
  path_keys <- function(numeric_at = integer(0)) {
    keys <- as.list(field)
    keys[numeric_at] <- lapply(keys[numeric_at], as.numeric)
    keys
  }

  is_set <- function(value) !is.null(value)
  is_nonempty <- function(value) length(value) > 0

  # Pick exactly one strategy: which part of the path must exist
  # (probe_keys / has_value) and which path yields the value (value_keys).
  if (depth == 3 && field[1] == "geo") {
    probe_keys <- as.list(field[1:2])
    has_value <- is_set
    value_keys <- path_keys(3)
  } else if (depth <= 3) {
    probe_keys <- as.list(field)
    has_value <- is_set
    value_keys <- probe_keys
  } else if (depth == 4 && field[2] == "urls") {
    # presence is decided by a non-empty urls container, not by the leaf
    probe_keys <- as.list(field[1:2])
    has_value <- is_nonempty
    value_keys <- path_keys(3)
  } else if (depth == 4) {
    probe_keys <- as.list(field)
    has_value <- is_nonempty
    value_keys <- probe_keys
  } else if (depth == 6 && field[2] == "bounding_box") {
    probe_keys <- as.list(field[1:2])
    has_value <- is_nonempty
    value_keys <- path_keys(4:6)
  } else {
    stop("Unsupported field path: ", paste(field, collapse = " > "),
         call. = FALSE)
  }

  present <- vapply(lst, function(x) has_value(dig(x, probe_keys)), logical(1))
  vect <- rep(NA, length(lst))
  vect[present] <- unlist(lapply(lst[present], dig, keys = value_keys))
  return(vect)
}
答案 0 :(得分:1)
我无法重现您的问题。不过,使用 streamR 包自带的示例数据 data(example_tweets) 以及下面这段(在 streamR 源码基础上修改的)代码,您提到的变量都能被正确解析。
**代码:**
# Load streamR, which provides the example_tweets sample data set.
# library() stops with an error when the package is not installed, whereas
# require() would only return FALSE and let the script fail later.
library(streamR)
data(example_tweets)
# Start from the parseTweets code linked above and add the 'profile_...' variables
# Parse tweets into a data frame with one row per tweet, extending the stock
# parseTweets() columns with additional user "profile_*" fields.
#
# Arguments:
#   tweets   - passed unchanged to readTweets().
#   simplify - when FALSE (default), place, geo and URL-entity columns are added.
#   verbose  - when TRUE (default), print the number of parsed tweets.
#
# Returns:
#   A data.frame of tweet-level and user-level fields. A field that is absent
#   from a tweet is NA in that row.
#
# Stops with an error when readTweets() returns no tweets.
parseTweets_new <- function(tweets, simplify=FALSE, verbose=TRUE){
  ## from json to list
  results.list <- readTweets(tweets, verbose = FALSE)

  # nothing to build a data frame from: fail early with a readable message
  if (length(results.list) == 0) {
    stop(deparse(substitute(tweets)), " did not contain any tweets. ",
         "See ?parseTweets for more details.")
  }

  # one value per tweet for the given (possibly nested) field path
  pick <- function(path) unlistWithNA(results.list, path)

  # element-wise mean of two equally long vectors, ignoring NAs
  # (yields NaN where both values are missing)
  mean_pair <- function(a, b) {
    vapply(seq_along(a),
           function(i) mean(c(a[i], b[i]), na.rm = TRUE),
           numeric(1))
  }

  # output column name -> path of the value inside a parsed tweet;
  # the order here is the column order of the result
  column_paths <- list(
    text = "text",
    retweet_count = c("retweeted_status", "retweet_count"),
    favorited = "favorited",
    truncated = "truncated",
    id_str = "id_str",
    in_reply_to_screen_name = "in_reply_to_screen_name",
    source = "source",
    retweeted = "retweeted",
    created_at = "created_at",
    in_reply_to_status_id_str = "in_reply_to_status_id_str",
    in_reply_to_user_id_str = "in_reply_to_user_id_str",
    lang = "lang",
    listed_count = c("user", "listed_count"),
    verified = c("user", "verified"),
    location = c("user", "location"),
    user_id_str = c("user", "id_str"),
    description = c("user", "description"),
    geo_enabled = c("user", "geo_enabled"),
    user_created_at = c("user", "created_at"),
    statuses_count = c("user", "statuses_count"),
    followers_count = c("user", "followers_count"),
    favourites_count = c("user", "favourites_count"),
    protected = c("user", "protected"),
    user_url = c("user", "url"),
    name = c("user", "name"),
    time_zone = c("user", "time_zone"),
    user_lang = c("user", "lang"),
    utc_offset = c("user", "utc_offset"),
    friends_count = c("user", "friends_count"),
    screen_name = c("user", "screen_name"),
    # additional user profile variables
    profile_background_color = c("user", "profile_background_color"),
    profile_background_image_url = c("user", "profile_background_image_url"),
    profile_background_image_url_https = c("user", "profile_background_image_url_https"),
    profile_image_url = c("user", "profile_image_url"),
    profile_image_url_https = c("user", "profile_image_url_https"),
    profile_banner_url = c("user", "profile_banner_url"),
    following = c("user", "following"),
    follow_request_sent = c("user", "follow_request_sent"),
    notifications = c("user", "notifications")
  )

  # constructing data frame with tweet and user variables
  columns <- lapply(column_paths, pick)
  df <- do.call(data.frame, c(columns, list(stringsAsFactors = FALSE)))

  # retweet_count is extracted from retweeted_status. If this is not a RT, set to zero
  df$retweet_count[is.na(df$retweet_count)] <- 0

  # adding geographic variables and url entities
  if (simplify == FALSE) {
    df$country_code <- pick(c("place", "country_code"))
    df$country <- pick(c("place", "country"))
    df$place_type <- pick(c("place", "place_type"))
    df$full_name <- pick(c("place", "full_name"))
    df$place_name <- pick(c("place", "place_name"))
    df$place_id <- pick(c("place", "place_id"))

    # place latitude / longitude: mean of two corners of the bounding box
    place_lat_1 <- pick(c("place", "bounding_box", "coordinates", 1, 1, 2))
    place_lat_2 <- pick(c("place", "bounding_box", "coordinates", 1, 2, 2))
    df$place_lat <- mean_pair(place_lat_1, place_lat_2)
    place_lon_1 <- pick(c("place", "bounding_box", "coordinates", 1, 1, 1))
    place_lon_2 <- pick(c("place", "bounding_box", "coordinates", 1, 3, 1))
    df$place_lon <- mean_pair(place_lon_1, place_lon_2)

    df$lat <- pick(c("geo", "coordinates", 1))
    df$lon <- pick(c("geo", "coordinates", 2))
    df$expanded_url <- pick(c("entities", "urls", 1, "expanded_url"))
    df$url <- pick(c("entities", "urls", 1, "url"))
  }

  # information message
  if (verbose == TRUE) cat(length(df$text), "tweets have been parsed.", "\n")
  return(df)
}
# Extract one (possibly nested) field from every element of a list of parsed
# tweets, returning a vector with NA wherever the field is absent.
#
# Arguments:
#   lst   - list of tweets, each itself a nested list.
#   field - character vector giving the path to the value. Numeric positions
#           inside the path arrive as strings (c("geo", "coordinates", 1)
#           is coerced to character) and are converted back where needed.
#
# Supported path shapes (exactly one is applied):
#   * 1 to 3 names                         - plain nested lookup
#   * c("geo", <name>, <position>)         - positional lookup inside geo
#   * c(<name>, "urls", <position>, <name>)
#   * any other 4 names                    - plain nested lookup
#   * c(<name>, "bounding_box", <name>, <pos>, <pos>, <pos>)
#
# Returns:
#   A vector of length(lst); its type is whatever unlist() produces for the
#   values that were found (logical NA when none were found).
#
# Stops with an error for an empty path or a path shape not listed above.
unlistWithNA <- function(lst, field){
  depth <- length(field)
  if (depth == 0) {
    stop("`field` must contain at least one key.", call. = FALSE)
  }

  # follow `keys` (a list of names and/or positions) down a nested list;
  # indexing NULL yields NULL, so a missing intermediate level is harmless
  dig <- function(node, keys) {
    for (key in keys) {
      node <- node[[key]]
    }
    node
  }

  # the path as a list of keys, with the given positions turned back into numbers
  path_keys <- function(numeric_at = integer(0)) {
    keys <- as.list(field)
    keys[numeric_at] <- lapply(keys[numeric_at], as.numeric)
    keys
  }

  is_set <- function(value) !is.null(value)
  is_nonempty <- function(value) length(value) > 0

  # Pick exactly one strategy: which part of the path must exist
  # (probe_keys / has_value) and which path yields the value (value_keys).
  if (depth == 3 && field[1] == "geo") {
    probe_keys <- as.list(field[1:2])
    has_value <- is_set
    value_keys <- path_keys(3)
  } else if (depth <= 3) {
    probe_keys <- as.list(field)
    has_value <- is_set
    value_keys <- probe_keys
  } else if (depth == 4 && field[2] == "urls") {
    # presence is decided by a non-empty urls container, not by the leaf
    probe_keys <- as.list(field[1:2])
    has_value <- is_nonempty
    value_keys <- path_keys(3)
  } else if (depth == 4) {
    probe_keys <- as.list(field)
    has_value <- is_nonempty
    value_keys <- probe_keys
  } else if (depth == 6 && field[2] == "bounding_box") {
    probe_keys <- as.list(field[1:2])
    has_value <- is_nonempty
    value_keys <- path_keys(4:6)
  } else {
    stop("Unsupported field path: ", paste(field, collapse = " > "),
         call. = FALSE)
  }

  present <- vapply(lst, function(x) has_value(dig(x, probe_keys)), logical(1))
  vect <- rep(NA, length(lst))
  vect[present] <- unlist(lapply(lst[present], dig, keys = value_keys))
  return(vect)
}
# Read tweets in JSON format and return them as a list of parsed tweets.
#
# Arguments:
#   tweets  - either the path of an existing file with one JSON document per
#             line, or a character vector of JSON strings.
#   verbose - when TRUE (default), print the number of parsed tweets.
#
# Returns:
#   A list with one element per successfully parsed tweet. Lines that are
#   empty, fail to parse, or parse to fewer than 18 top-level elements are
#   dropped.
#
# Stops with an error when `tweets` is NULL.
readTweets <- function(tweets, verbose=TRUE){
  ## checking input is correct
  if (is.null(tweets)){
    stop("Error: you need to specify file or object where tweets text was stored.")
  }

  ## a single string naming an existing file is read from disk;
  ## anything else is treated as the JSON text itself
  if (length(tweets) == 1 && file.exists(tweets)) {
    lines <- readLines(tweets, encoding = "UTF-8")
  } else {
    lines <- tweets
  }

  # nzchar() instead of nchar() > 0: nchar() raises an error on strings that
  # are not valid in their declared encoding, nzchar() does not
  lines <- lines[nzchar(lines)]

  # a line that fails to parse yields the condition object instead of a tweet
  parsed <- lapply(lines, function(x) tryCatch(fromJSON(x), error = function(e) e))

  ## removing lines that do not contain tweets or were not properly parsed:
  ## condition objects and non-tweet messages have fewer than 18 elements
  is_tweet <- vapply(parsed, length, integer(1)) >= 18
  results.list <- parsed[is_tweet]

  # information message
  if (verbose == TRUE) cat(length(results.list), "tweets have been parsed.", "\n")
  return(results.list)
}
**输出:**
# Parse the example tweets with the extended parser; only two of the resulting
# columns are displayed below because of space constraints
tweets.df<-parseTweets_new(example_tweets)
#> tweets.df[,grep('profile',colnames(tweets.df))[1:2]]
# profile_background_color profile_background_image_url
#1 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#2 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#3 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#4 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#5 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#6 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#7 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#8 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#9 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#10 C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
正如我们所看到的,这段代码在添加新变量后可以正常工作。在您那里,unlistWithNA(results.list, c('user', "profile_background_color")) 的输出是什么?我怀疑问题出在您的推文数据上,请把结果告诉我们。