我试图抓住Reddit数据(我对网络抓取非常陌生,而在R上却有一半体面)。 RedditExtractor包具有很好的功能,可以满足我所需要的90%,但它并没有抓住"天赋"与发表评论的用户相关联。我试图玩这个包的功能,但我有点过头了。
有一些带有flair here的Reddit线程的例子。我想我正在寻找这些XML中的文本:
<span class="flair flair-orthodox" title="Eastern Orthodox">Eastern Orthodox</span>
我已经粘贴了reddit_content()
函数中的代码以及我认为额外代码应该发布的评论,但我不太确定从何处开始。目前,该函数返回一个数据框,其中包含注释,时间戳,用户等的列。如果它们存在,我还需要用它来生成用户注释的注释。提前谢谢!
redd_content_flair <- function (URL, wait_time = 2)
{
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes),
function(x) get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))))
}
data_extract = data.frame(id = numeric(), structure = character(),
post_date = as.Date(character()), comm_date = as.Date(character()),
num_comments = numeric(), subreddit = character(), upvote_prop = numeric(),
post_score = numeric(), author = character(), user = character(),
comment_score = numeric(), controversiality = numeric(),
comment = character(), title = character(), post_text = character(),
link = character(), domain = character(),
#flair = character(),
URL = character())
pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE)),
error = function(e) NULL)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X,
warn = FALSE)), error = function(e) NULL)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x) get.structure(main.node[[x]], x)))
TEMP = data.frame(id = NA, structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")), "%d-%m-%y"),
comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")), "%d-%m-%y"),
num_comments = meta.node$num_comments,
subreddit = ifelse(is.null(meta.node$subreddit),
"UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score, author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})),
comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})),
controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})),
comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})),
title = meta.node$title, post_text = meta.node$selftext,
link = meta.node$url, domain = meta.node$domain,
#flair = unlist(lapply(main.node, function(x) {GetAttribute(x, "flair")})),
URL = URL[i], stringsAsFactors = FALSE)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else print(paste("missed", i, ":", URL[i]))
}
}
utils::setTxtProgressBar(pb, i)
Sys.sleep(min(2, wait_time))
}
close(pb)
return(data_extract)
}
编辑:我还想抓住&#34;父母的网址&#34;评论,看起来像
中的标签<p class="parent"><a name="d3t1p1r"></a></p>
答案 0 :(得分:2)
我设法提出了一个临时解决方案。我会在这里张贴后代。问题是函数as-is未设置为处理NULL JSON值。这是一个快速解决方案。
关于中途,有两条raw_data =
行。您需要将nullValue = 'your null text'
参数添加到fromJSON
函数中。然后,您可以使用与其他地方相同的结构,将您想要的任何元数据添加到空数据框和TEMP数据框中。在下面的函数中,我添加了用户的天赋文本和父评论的ID。
(请注意,不完美的缩进来自原始功能......我已将其保留为原因,以防止意外更改某些内容。)
reddit.fixed <- function (URL, wait_time = 2)
{
if (is.null(URL) | length(URL) == 0 | !is.character(URL)) {
stop("invalid URL parameter")
}
GetAttribute = function(node, feature) {
Attribute = node$data[[feature]]
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(Attribute, lapply(reply.nodes, function(x) {
GetAttribute(x, feature)
})))
}
get.structure = function(node, depth = 0) {
if (is.null(node)) {
return(list())
}
filter = is.null(node$data$author)
replies = node$data$replies
reply.nodes = if (is.list(replies))
replies$data$children
else NULL
return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes),
function(x) get.structure(reply.nodes[[x]], paste0(depth,
"_", x)))))
}
data_extract = data.frame(id = numeric(), structure = character(),
post_date = as.Date(character()), comm_date = as.Date(character()),
num_comments = numeric(), subreddit = character(), upvote_prop = numeric(),
post_score = numeric(), author = character(), user = character(),
comment_score = numeric(), controversiality = numeric(),
comment = character(), title = character(), post_text = character(),
link = character(), domain = character(), URL = character(), flair = character(), parent = character())
pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3)
for (i in seq(URL)) {
if (!grepl("^https?://(.*)", URL[i]))
URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)",
"\\1", URL[i]))
if (!grepl("\\?ref=search_posts$", URL[i]))
URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts")
X = paste0(gsub("\\?ref=search_posts$", "", URL[i]),
".json?limit=500")
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE), nullValue = "none"),
error = function(e) NULL)
if (is.null(raw_data)) {
Sys.sleep(min(1, wait_time))
raw_data = tryCatch(RJSONIO::fromJSON(readLines(X,
warn = FALSE), nullValue = "none"), error = function(e) NULL)
}
if (is.null(raw_data) == FALSE) {
meta.node = raw_data[[1]]$data$children[[1]]$data
main.node = raw_data[[2]]$data$children
if (min(length(meta.node), length(main.node)) > 0) {
structure = unlist(lapply(1:length(main.node),
function(x) get.structure(main.node[[x]], x)))
TEMP = data.frame(id = NA, structure = gsub("FALSE ",
"", structure[!grepl("TRUE", structure)]),
post_date = format(as.Date(as.POSIXct(meta.node$created_utc,
origin = "1970-01-01")), "%d-%m-%y"), comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node,
function(x) {
GetAttribute(x, "created_utc")
})), origin = "1970-01-01")), "%d-%m-%y"),
num_comments = meta.node$num_comments, subreddit = ifelse(is.null(meta.node$subreddit),
"UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio,
post_score = meta.node$score, author = meta.node$author,
user = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author")
})), comment_score = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "score")
})), controversiality = unlist(lapply(main.node,
function(x) {
GetAttribute(x, "controversiality")
})), comment = unlist(lapply(main.node, function(x) {
GetAttribute(x, "body")
})), title = meta.node$title, post_text = meta.node$selftext,
link = meta.node$url, domain = meta.node$domain,
URL = URL[i],
flair = unlist(lapply(main.node, function(x) {
GetAttribute(x, "author_flair_text")
})),
parent = unlist(lapply(main.node, function(x) {GetAttribute(x, "parent_id")})),
stringsAsFactors = FALSE)
TEMP$id = 1:nrow(TEMP)
if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0)
data_extract = rbind(TEMP, data_extract)
else print(paste("missed", i, ":", URL[i]))
}
}
utils::setTxtProgressBar(pb, i)
Sys.sleep(min(2, wait_time))
}
close(pb)
return(data_extract)
}