将变量添加到streamR包中的parseTweet

时间:2014-06-19 20:03:25

标签: r parsing twitter dataframe

我一直在使用 R 的 streamR 包中的 parseTweets 函数,效果很好;但是在通过 readTweets 函数"读取"推文时,我发现有一些变量没有被解析(例如用户的背景颜色)。

查看代码后,把被忽略的变量加进去似乎非常简单;但是当我这样做时,写入数据框的过程似乎跳过了这些"新"变量。我已经单独调试过这些变量,它们确实能取到值。有人能看出我遗漏了什么吗?

     parseTweet_more <- function(tweets, simplify=FALSE, verbose=TRUE){
    # Parse tweets into a data frame with one row per tweet.
    #
    # tweets:   passed unchanged to readTweets(), which turns it into a list
    #           of parsed tweets.
    # simplify: when FALSE (the default), place, geo and url-entity columns
    #           are appended to the result.
    # verbose:  when TRUE, prints how many tweets were parsed.
    #
    # Returns a data frame of tweet-level and user-level columns. Stops with
    # an error when readTweets() yields no tweets.

    ## from json to list
    results.list <- readTweets(tweets, verbose=FALSE)

    # nothing to parse: fail early, naming the expression the caller passed in
    if (length(results.list)==0){
        stop(deparse(substitute(tweets)), " did not contain any tweets. ",
            "See ?parseTweets for more details.")
    }

    # constructing data frame with tweet and user variable
    df <- data.frame(
        text = unlistWithNA(results.list, 'text'),
        screen_name = unlistWithNA(results.list, c('user', 'screen_name')),
        retweet_count = unlistWithNA(results.list, c('retweeted_status', 'retweet_count')),
        favorited = unlistWithNA(results.list, 'favorited'),
        truncated = unlistWithNA(results.list, 'truncated'),
        # NOTE(review): constant placeholder recycled onto every row; looks like
        # a leftover debugging column -- confirm before removing
        hola='de huebos',
        id_str = unlistWithNA(results.list, 'id_str'),
        in_reply_to_screen_name = unlistWithNA(results.list, 'in_reply_to_screen_name'),
        source = unlistWithNA(results.list, 'source'),
        retweeted = unlistWithNA(results.list, 'retweeted'),
        created_at = unlistWithNA(results.list, 'created_at'),
        in_reply_to_status_id_str = unlistWithNA(results.list, 'in_reply_to_status_id_str'),
        in_reply_to_user_id_str = unlistWithNA(results.list, 'in_reply_to_user_id_str'),
        lang = unlistWithNA(results.list, 'lang'),
        listed_count = unlistWithNA(results.list, c('user', 'listed_count')),
        verified = unlistWithNA(results.list, c('user', 'verified')),
        location = unlistWithNA(results.list, c('user', 'location')),
        user_id_str = unlistWithNA(results.list, c('user', 'id_str')),
        description = unlistWithNA(results.list, c('user', 'description')),
        geo_enabled = unlistWithNA(results.list, c('user', 'geo_enabled')),
        user_created_at = unlistWithNA(results.list, c('user', 'created_at')),
        statuses_count = unlistWithNA(results.list, c('user', 'statuses_count')),
        followers_count = unlistWithNA(results.list, c('user', 'followers_count')),
        favourites_count = unlistWithNA(results.list, c('user', 'favourites_count')),
        protected = unlistWithNA(results.list, c('user', 'protected')),
        user_url = unlistWithNA(results.list, c('user', 'url')),
        name = unlistWithNA(results.list, c('user', 'name')),
        time_zone = unlistWithNA(results.list, c('user', 'time_zone')),
        user_lang = unlistWithNA(results.list, c('user', 'lang')),
        utc_offset = unlistWithNA(results.list, c('user', 'utc_offset')),
        following_count = unlistWithNA(results.list, c('user', 'friends_count')),
        is_translation_enabled = unlistWithNA(results.list, c('user', 'is_translation_enabled')),

        # additional user profile variables
        profile_background_color = unlistWithNA(results.list, c('user', 'profile_background_color')),
        # the closing parenthesis of this call was missing, which swallowed
        # every column below it into the unlistWithNA() call
        profile_image_url = unlistWithNA(results.list, c('user', 'profile_image_url')),
        profile_link_color = unlistWithNA(results.list, c('user', 'profile_link_color')),
        profile_sidebar_border_color = unlistWithNA(results.list, c('user', 'profile_sidebar_border_color')),
        profile_sidebar_fill_color = unlistWithNA(results.list, c('user', 'profile_sidebar_fill_color')),
        profile_text_color = unlistWithNA(results.list, c('user', 'profile_text_color')),
        following = unlistWithNA(results.list, c('user', 'following')),
        stringsAsFactors=FALSE)

    # retweet_count is extracted from retweeted_status. If this is not a RT, set to zero
    df$retweet_count[is.na(df$retweet_count)] <- 0

    # adding geographic variables and url entities
    if (simplify==FALSE){
        df$country_code <- unlistWithNA(results.list, c('place', 'country_code'))
        df$country <- unlistWithNA(results.list, c('place', 'country'))
        df$place_type <- unlistWithNA(results.list, c('place', 'place_type'))
        df$full_name <- unlistWithNA(results.list, c('place', 'full_name'))
        df$place_name <- unlistWithNA(results.list, c('place', 'place_name'))
        df$place_id <- unlistWithNA(results.list, c('place', 'place_id'))

        # place_lat / place_lon are the mean of two bounding-box entries;
        # when both are missing the mean is NaN
        place_lat_1 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 1, 2))
        place_lat_2 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 2, 2))
        df$place_lat <- vapply(seq_along(results.list), function(i)
            mean(c(place_lat_1[i], place_lat_2[i]), na.rm=TRUE), numeric(1))
        place_lon_1 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 1, 1))
        place_lon_2 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 3, 1))
        df$place_lon <- vapply(seq_along(results.list), function(i)
            mean(c(place_lon_1[i], place_lon_2[i]), na.rm=TRUE), numeric(1))

        df$lat <- unlistWithNA(results.list, c('geo', 'coordinates', 1))
        df$lon <- unlistWithNA(results.list, c('geo', 'coordinates', 2))
        df$expanded_url <- unlistWithNA(results.list, c('entities', 'urls', 1, 'expanded_url'))
        df$url <- unlistWithNA(results.list, c('entities', 'urls', 1, 'url'))
    }

    # information message
    if (verbose==TRUE) cat(length(df$text), "tweets have been parsed.", "\n")
    return(df)
}


unlistWithNA <- function(lst, field){
    # Extract the value stored at path `field` from every element of `lst`,
    # returning a vector with one entry per element and NA where it is absent.
    #
    # lst:   list of nested lists (parsed tweets).
    # field: path to the value. Supported shapes:
    #          1 name                       x[[f1]]
    #          2 names                      x[[f1]][[f2]]
    #          3 names                      x[[f1]][[f2]][[f3]]
    #          3, starting with "geo"       x[[f1]][[f2]][[<numeric f3>]]
    #          4 names                      x[[f1]][[f2]][[f3]][[f4]]
    #          4, second is "urls"          x[[f1]][[f2]][[<numeric f3>]][[f4]]
    #          6, second is "bounding_box"  last three components are numeric
    #        Numeric components arrive as strings (c() coerces a mixed path
    #        to character) and are converted back with as.numeric().
    #
    # Stops with an error for any other path shape.
    #
    # The branches are mutually exclusive, so the positional "geo" lookup only
    # applies to 3-component paths and cannot overwrite the result of a
    # 1- or 2-component lookup that happens to start with "geo".
    depth <- length(field)

    if (depth==1){
        notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field]])))
        vect <- rep(NA, length(lst))
        vect[notnulls] <- unlist(lapply(lst[notnulls], '[[', field))
    } else if (depth==2){
        notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
        vect <- rep(NA, length(lst))
        vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]]))
    } else if (depth==3 && field[1]!="geo"){
        notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]][[field[3]]])))
        vect <- rep(NA, length(lst))
        vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[field[3]]]))
    } else if (depth==3 && field[1]=="geo"){
        # presence is decided on the first two components; the third is a position
        notnulls <- unlist(lapply(lst, function(x) !is.null(x[[field[1]]][[field[2]]])))
        vect <- rep(NA, length(lst))
        vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[as.numeric(field[3])]]))
    } else if (depth==4 && field[2]!="urls"){
        notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]][[field[3]]][[field[4]]])>0))
        vect <- rep(NA, length(lst))
        vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[field[3]]][[field[4]]]))
    } else if (depth==4 && field[2]=="urls"){
        # an element counts as present when its urls entry is non-empty
        notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]])>0))
        vect <- rep(NA, length(lst))
        vect[notnulls] <- unlist(lapply(lst[notnulls], function(x) x[[field[1]]][[field[2]]][[as.numeric(field[3])]][[field[4]]]))
    } else if (depth==6 && field[2]=="bounding_box"){
        # an element counts as present when its bounding_box entry is non-empty
        notnulls <- unlist(lapply(lst, function(x) length(x[[field[1]]][[field[2]]])>0))
        vect <- rep(NA, length(lst))
        vect[notnulls] <- unlist(lapply(lst[notnulls], function(x)
            x[[field[1]]][[field[2]]][[field[3]]][[as.numeric(field[4])]][[as.numeric(field[5])]][[as.numeric(field[6])]]))
    } else {
        # previously this fell through to return(vect) with `vect` undefined
        stop("Unsupported field path: ", paste(field, collapse="/"), call.=FALSE)
    }
    return(vect)
}

1 个答案:

答案 0 :(得分:1)

我无法重现您的问题。使用 streamR 包自带的示例数据 data(example_tweets) 运行以下代码,您提到的变量都能被正确解析。

**代码:**

# load streamR and its bundled sample tweets; library() fails loudly when the
# package is missing, whereas require() would only return FALSE
library(streamR)
data(example_tweets)

#download code from above link and add 'profile_...' variables

parseTweets_new <- function(tweets, simplify=FALSE, verbose=TRUE){
    # Parse tweets into a data frame with one row per tweet, including the
    # extra user 'profile_...' variables.
    #
    # tweets:   passed unchanged to readTweets(), which turns it into a list
    #           of parsed tweets.
    # simplify: when FALSE (the default), place, geo and url-entity columns
    #           are appended to the result.
    # verbose:  when TRUE, prints how many tweets were parsed.
    #
    # Returns a data frame of tweet-level and user-level columns. Stops with
    # an error when readTweets() yields no tweets.

    ## from json to list
    results.list <- readTweets(tweets, verbose=FALSE)

    # nothing to parse: fail early, naming the expression the caller passed in
    if (length(results.list)==0){
        stop(deparse(substitute(tweets)), " did not contain any tweets. ",
            "See ?parseTweets for more details.")
    }

    # output column name -> path of the value inside each parsed tweet;
    # the order here is the column order of the resulting data frame
    column_paths <- list(
        text = 'text',
        retweet_count = c('retweeted_status', 'retweet_count'),
        favorited = 'favorited',
        truncated = 'truncated',
        id_str = 'id_str',
        in_reply_to_screen_name = 'in_reply_to_screen_name',
        source = 'source',
        retweeted = 'retweeted',
        created_at = 'created_at',
        in_reply_to_status_id_str = 'in_reply_to_status_id_str',
        in_reply_to_user_id_str = 'in_reply_to_user_id_str',
        lang = 'lang',
        listed_count = c('user', 'listed_count'),
        verified = c('user', 'verified'),
        location = c('user', 'location'),
        user_id_str = c('user', 'id_str'),
        description = c('user', 'description'),
        geo_enabled = c('user', 'geo_enabled'),
        user_created_at = c('user', 'created_at'),
        statuses_count = c('user', 'statuses_count'),
        followers_count = c('user', 'followers_count'),
        favourites_count = c('user', 'favourites_count'),
        protected = c('user', 'protected'),
        user_url = c('user', 'url'),
        name = c('user', 'name'),
        time_zone = c('user', 'time_zone'),
        user_lang = c('user', 'lang'),
        utc_offset = c('user', 'utc_offset'),
        friends_count = c('user', 'friends_count'),
        screen_name = c('user', 'screen_name'),

        # added user profile variables
        profile_background_color = c('user', 'profile_background_color'),
        profile_background_image_url = c('user', 'profile_background_image_url'),
        profile_background_image_url_https = c('user', 'profile_background_image_url_https'),
        profile_image_url = c('user', 'profile_image_url'),
        profile_image_url_https = c('user', 'profile_image_url_https'),
        profile_banner_url = c('user', 'profile_banner_url'),
        following = c('user', 'following'),
        follow_request_sent = c('user', 'follow_request_sent'),
        notifications = c('user', 'notifications'))

    # constructing data frame with tweet and user variable
    columns <- lapply(column_paths, function(path) unlistWithNA(results.list, path))
    df <- do.call(data.frame, c(columns, list(stringsAsFactors=FALSE)))

    # retweet_count is extracted from retweeted_status. If this is not a RT, set to zero
    df$retweet_count[is.na(df$retweet_count)] <- 0

    # adding geographic variables and url entities
    if (simplify==FALSE){
        df$country_code <- unlistWithNA(results.list, c('place', 'country_code'))
        df$country <- unlistWithNA(results.list, c('place', 'country'))
        df$place_type <- unlistWithNA(results.list, c('place', 'place_type'))
        df$full_name <- unlistWithNA(results.list, c('place', 'full_name'))
        df$place_name <- unlistWithNA(results.list, c('place', 'place_name'))
        df$place_id <- unlistWithNA(results.list, c('place', 'place_id'))

        # place_lat / place_lon are the mean of two bounding-box entries;
        # when both are missing the mean is NaN
        place_lat_1 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 1, 2))
        place_lat_2 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 2, 2))
        df$place_lat <- vapply(seq_along(results.list), function(i)
            mean(c(place_lat_1[i], place_lat_2[i]), na.rm=TRUE), numeric(1))
        place_lon_1 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 1, 1))
        place_lon_2 <- unlistWithNA(results.list, c('place', 'bounding_box', 'coordinates', 1, 3, 1))
        df$place_lon <- vapply(seq_along(results.list), function(i)
            mean(c(place_lon_1[i], place_lon_2[i]), na.rm=TRUE), numeric(1))

        df$lat <- unlistWithNA(results.list, c('geo', 'coordinates', 1))
        df$lon <- unlistWithNA(results.list, c('geo', 'coordinates', 2))
        df$expanded_url <- unlistWithNA(results.list, c('entities', 'urls', 1, 'expanded_url'))
        df$url <- unlistWithNA(results.list, c('entities', 'urls', 1, 'url'))
    }

    # information message
    if (verbose==TRUE) cat(length(df$text), "tweets have been parsed.", "\n")
    return(df)
}


unlistWithNA <- function(lst, field){
    # Pull the value found at path `field` out of every element of `lst`,
    # giving one entry per element and NA wherever the value is missing.
    #
    # lst:   list of nested lists.
    # field: vector of path components; the checks below handle paths of
    #        length 1, 2, 3, 4 and 6 (the latter only under "bounding_box").
    #        Positional components are converted with as.numeric().
    #
    # The checks run one after another, so a later matching check replaces
    # the result of an earlier one.
    depth <- length(field)

    # shared fill step: start from all-NA, then overwrite the flagged positions
    fillFlagged <- function(present, getter){
        filled <- rep(NA, length(lst))
        filled[present] <- unlist(lapply(lst[present], getter))
        filled
    }

    if (depth==1){
        present <- unlist(lapply(lst, function(tw) !is.null(tw[[field]])))
        vect <- fillFlagged(present, function(tw) tw[[field]])
    }
    if (depth==2){
        getter <- function(tw) tw[[field[1]]][[field[2]]]
        present <- unlist(lapply(lst, function(tw) !is.null(getter(tw))))
        vect <- fillFlagged(present, getter)
    }
    if (depth==3 & field[1]!="geo"){
        getter <- function(tw) tw[[field[1]]][[field[2]]][[field[3]]]
        present <- unlist(lapply(lst, function(tw) !is.null(getter(tw))))
        vect <- fillFlagged(present, getter)
    }
    if (field[1]=="geo"){
        # presence is judged on the first two components only
        parent <- function(tw) tw[[field[1]]][[field[2]]]
        present <- unlist(lapply(lst, function(tw) !is.null(parent(tw))))
        vect <- fillFlagged(present, function(tw) parent(tw)[[as.numeric(field[3])]])
    }

    if (depth==4 && field[2]!="urls"){
        getter <- function(tw) tw[[field[1]]][[field[2]]][[field[3]]][[field[4]]]
        present <- unlist(lapply(lst, function(tw) length(getter(tw))>0))
        vect <- fillFlagged(present, getter)
    }
    if (depth==4 && field[2]=="urls"){
        # present when the entry under the first two components is non-empty
        parent <- function(tw) tw[[field[1]]][[field[2]]]
        present <- unlist(lapply(lst, function(tw) length(parent(tw))>0))
        vect <- fillFlagged(present, function(tw) parent(tw)[[as.numeric(field[3])]][[field[4]]])
    }
    if (depth==6 && field[2]=="bounding_box"){
        # present when the entry under the first two components is non-empty
        parent <- function(tw) tw[[field[1]]][[field[2]]]
        present <- unlist(lapply(lst, function(tw) length(parent(tw))>0))
        vect <- fillFlagged(present, function(tw)
            parent(tw)[[field[3]]][[as.numeric(field[4])]][[as.numeric(field[5])]][[as.numeric(field[6])]])
    }
    return(vect)
}

readTweets <- function(tweets, verbose=TRUE){
    # Turn tweets stored as JSON text into a list with one parsed element
    # per tweet.
    #
    # tweets:  either a single string naming an existing file (read line by
    #          line as UTF-8) or a character vector of JSON strings.
    # verbose: when TRUE, prints how many tweets were parsed.
    #
    # Returns the list of parsed tweets. Stops with an error when `tweets`
    # is NULL.
    if (is.null(tweets)){
        stop("Error: you need to specify file or object where tweets text was stored.")
    }

    # one existing file path -> read it; anything else is used as the lines themselves
    is_file <- length(tweets)==1 && file.exists(tweets)
    json_lines <- if (is_file) readLines(tweets, encoding="UTF-8") else tweets

    # empty lines are skipped; a line that fails to parse yields its error
    # object in place of a tweet
    non_empty <- json_lines[nchar(json_lines)>0]
    parsed <- lapply(non_empty, function(one_line) {
        tryCatch(fromJSON(one_line), error=function(e) e)
    })

    # drop every element with fewer than 18 components: lines that do not
    # contain tweets or were not properly parsed
    component_counts <- unlist(lapply(parsed, length))
    rejected <- which(component_counts<18)
    if (length(rejected)>0){
        parsed <- parsed[-rejected]
    }

    # information message
    if (verbose==TRUE) cat(length(parsed), "tweets have been parsed.", "\n")
    return(parsed)
}

**输出:**

# Parse the example tweets with the extended parser. To save space, the
# printout below shows only the first two columns whose name contains 'profile'.

tweets.df<-parseTweets_new(example_tweets)
#> tweets.df[,grep('profile',colnames(tweets.df))[1:2]]
#   profile_background_color                                                     profile_background_image_url
#1                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#2                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#3                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#4                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#5                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#6                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#7                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#8                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#9                    C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png
#10                   C0DEED http://a0.twimg.com/profile_background_images/656927849/miyt9dpjz77sc0w3d4vj.png

正如我们所看到的,这段代码可以正常添加新变量。在您的数据上,unlistWithNA(results.list, c('user', "profile_background_color")) 的输出是什么?我怀疑是您的推文数据有问题,请告诉我们结果。