R RedditExtractoR package workaround

Date: 2016-09-21 16:19:09

Tags: r

I am trying to use the RedditExtractoR package, which I have used many times in the past. I had not used it since last month, but when I tried it this week it returned an empty data frame.

get_reddit(subreddit="jokes")
 |======================================================================================================================================| 100%
     [1] id               structure        post_date        comm_date        num_comments     subreddit        upvote_prop      post_score      
     [9] author           user             comment_score    controversiality comment          title            post_text        link            
    [17] domain           URL             
    <0 rows> (or 0-length row.names)

I dug into the get_reddit() function; it seems to call reddit_urls() and then fetch each URL, loading the page as JSON. The reddit_urls() function still returns a data frame of Reddit page URLs, and appending .json to the end of a URL still seems to load the page as a JSON object.
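For reference, that fetch-and-parse step looks roughly like the following; a minimal sketch using RJSONIO, with a placeholder thread URL:

# Placeholder thread URL; any Reddit post URL behaves the same way
url <- "https://www.reddit.com/r/jokes/comments/abc123/some_post/"

# Appending .json to the thread URL returns the page as a JSON document
json_text <- readLines(paste0(gsub("/$", "", url), ".json?limit=500"), warn = FALSE)
raw_data  <- RJSONIO::fromJSON(json_text)

# raw_data[[1]] holds the post metadata, raw_data[[2]] holds the comment tree
str(raw_data, max.level = 2)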

Is anyone else having problems with this package, and/or does anyone know a workaround for parsing the JSON object into a data frame?

Thanks

1 Answer:

Answer 0 (score: 1)

I had the same problem. Here is my workaround for it...

You have to change the URL scheme from http:// to https:// in reddit_content(), so the function looks like:

reddit_content <- function (URL, wait_time = 2) 
{
    if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
        stop("invalid URL parameter")
    }
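    # Recursively extract a given attribute from a comment node and all of its replies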
    GetAttribute = function(node, feature) {
        Attribute = node$data[[feature]]
        replies = node$data$replies
        reply.nodes = if (is.list(replies)) 
            replies$data$children
        else NULL
        return(list(Attribute, lapply(reply.nodes, function(x) {
            GetAttribute(x, feature)
        })))
    }
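    # Recursively record each comment's position in the reply tree (used for the structure column)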
    get.structure = function(node, depth = 0) {
        if (is.null(node)) {
            return(list())
        }
        filter = is.null(node$data$author)
        replies = node$data$replies
        reply.nodes = if (is.list(replies)) 
            replies$data$children
        else NULL
        return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes), 
            function(x) get.structure(reply.nodes[[x]], paste0(depth, 
                "_", x)))))
    }
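    # Empty data frame defining the columns of the returned result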
    data_extract = data.frame(id = numeric(), structure = character(), 
        post_date = as.Date(character()), comm_date = as.Date(character()), 
        num_comments = numeric(), subreddit = character(), upvote_prop = numeric(), 
        post_score = numeric(), author = character(), user = character(), 
        comment_score = numeric(), controversiality = numeric(), 
        comment = character(), title = character(), post_text = character(), 
        link = character(), domain = character(), URL = character())
    pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
    for (i in seq(URL)) {
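        # Key change for this workaround: build the request URL with https:// instead of http://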
        if (!grepl("^https://(.*)", URL[i])) 
            URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)", 
                "\\1", URL[i]))
        if (!grepl("\\?ref=search_posts$", URL[i])) 
            URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
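        # Append .json to the thread URL, then download and parse it; retry once if the first attempt fails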
        X = paste0(gsub("\\?ref=search_posts$", "", URL[i]), 
            ".json?limit=500")
        raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE)), 
            error = function(e) NULL)
        if (is.null(raw_data)) {
            Sys.sleep(min(1, wait_time))
            raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, 
                warn = FALSE)), error = function(e) NULL)
        }
        if (is.null(raw_data) == FALSE) {
            meta.node = raw_data[[1]]$data$children[[1]]$data
            main.node = raw_data[[2]]$data$children
            if (min(length(meta.node), length(main.node)) > 0) {
                structure = unlist(lapply(1:length(main.node), 
                  function(x) get.structure(main.node[[x]], x)))
                TEMP = data.frame(id = NA, structure = gsub("FALSE ", 
                  "", structure[!grepl("TRUE", structure)]), 
                  post_date = format(as.Date(as.POSIXct(meta.node$created_utc, 
                    origin = "1970-01-01")), "%d-%m-%y"), comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node, 
                    function(x) {
                      GetAttribute(x, "created_utc")
                    })), origin = "1970-01-01")), "%d-%m-%y"), 
                  num_comments = meta.node$num_comments, subreddit = ifelse(is.null(meta.node$subreddit), 
                    "UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio, 
                  post_score = meta.node$score, author = meta.node$author, 
                  user = unlist(lapply(main.node, function(x) {
                    GetAttribute(x, "author")
                  })), comment_score = unlist(lapply(main.node, 
                    function(x) {
                      GetAttribute(x, "score")
                    })), controversiality = unlist(lapply(main.node, 
                    function(x) {
                      GetAttribute(x, "controversiality")
                    })), comment = unlist(lapply(main.node, function(x) {
                    GetAttribute(x, "body")
                  })), title = meta.node$title, post_text = meta.node$selftext, 
                  link = meta.node$url, domain = meta.node$domain, 
                  URL = URL[i], stringsAsFactors = FALSE)
                TEMP$id = 1:nrow(TEMP)
                if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0) 
                  data_extract = rbind(TEMP, data_extract)
                else print(paste("missed", i, ":", URL[i]))
            }
        }
        utils::setTxtProgressBar(pb, i)
        Sys.sleep(min(2, wait_time))
    }
    close(pb)
    return(data_extract)
}

Then redefine the get_reddit() function as:

get_reddit <- function (search_terms = NA, regex_filter = "", subreddit = NA, 
    cn_threshold = 1, page_threshold = 1, sort_by = "comments", 
    wait_time = 2) 
{
    URL = unique(as.character(reddit_urls(search_terms, regex_filter, 
        subreddit, cn_threshold, page_threshold, sort_by, wait_time)$URL))
    retrieved_data = reddit_content(URL, wait_time)
    return(retrieved_data)
}
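With both patched functions defined in the session they mask the package versions, so the original call should work again. A minimal usage sketch, assuming RedditExtractoR is attached so that its exported reddit_urls() is still found:

library(RedditExtractoR)

# Define the patched reddit_content() and get_reddit() shown above, then query as before;
# the copies in the global environment take precedence over the package versions.
jokes <- get_reddit(subreddit = "jokes")
head(jokes[, c("id", "author", "comment_score", "title")])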