下面的代码中有一个嵌套的 for 循环。
它逐列、逐行遍历数据框——有没有简单的方法可以将其向量化?
补充说明:循环体检查某一列中每个条目(列表)是否只包含 NA;如果整列都是如此,就可以删除该列。
# Dependencies (install once if they are missing):
#   install.packages("rtweet")
#   install.packages("janitor")
library(rtweet)
library(janitor)

# Request the Rbloggers timeline (n = 10000) and open it in the data viewer.
rbloggers <- get_timeline(user = "Rbloggers", n = 10000)
View(rbloggers)

# remove_empty() drops the columns that are NA or blank, but only those that
# are not stored as lists. The result is converted to a plain data.frame.
# Goal for later: readr::write_csv, which cannot handle vectors of type list.
rbloggers <- as.data.frame(
  janitor::remove_empty(rbloggers, which = "cols")
)
# Drop every column in which each entry (a scalar, or the vector held in a
# list cell) consists solely of NA values.
#
# The keep/drop decision is computed for all columns first and applied in a
# single subsetting step. Deleting columns inside `for (j in 1:ncol(...))`
# shifts the remaining columns left, so the column following each deleted one
# is never checked and `j` eventually points past the last column.
all_na_col <- vapply(
  rbloggers,
  function(column) {
    # One TRUE/FALSE per row: does this entry contain nothing but NA?
    entry_all_na <- vapply(
      column,
      function(entry) all(is.na(entry)),
      logical(1),
      USE.NAMES = FALSE
    )
    all(entry_all_na)
  },
  logical(1)
)

# With zero rows every column is vacuously "all NA"; leave the data untouched.
if (nrow(rbloggers) > 0 && any(all_na_col)) {
  # drop = FALSE keeps a data.frame even if only one column survives.
  rbloggers <- rbloggers[, !all_na_col, drop = FALSE]
}
# Other ways to remove a single column by position:
# # Data[2] <- NULL
# # Data[[2]] <- NULL
# # Data <- Data[,-2]
# # Data <- Data[-2]
仅供参考-我试图理解以下参考文献:
答案 0 :(得分:0)
library(rtweet)
library(janitor)

# Download the timeline, then let janitor discard the columns it regards as
# empty.
rbloggers <- get_timeline(user = "Rbloggers", n = 10000)
rbloggers <- janitor::remove_empty(rbloggers, which = "cols")

# Per-column count of NA entries.
# NOTE(review): for list columns is.na() flags only cells that are a single
# NA, so cells holding longer all-NA vectors are presumably not counted.
colSums(is.na(rbloggers))
#> user_id status_id created_at
#> 0 0 0
#> screen_name text source
#> 0 0 0
#> display_text_width is_quote is_retweet
#> 0 0 0
#> favorite_count retweet_count hashtags
#> 0 0 0
#> urls_url urls_t.co urls_expanded_url
#> 0 0 0
#> mentions_user_id mentions_screen_name lang
#> 3175 3175 0
#> geo_coords coords_coords bbox_coords
#> 0 0 0
#> status_url name location
#> 0 0 0
#> description url protected
#> 0 0 0
#> followers_count friends_count listed_count
#> 0 0 0
#> statuses_count favourites_count account_created_at
#> 0 0 0
#> verified profile_url profile_expanded_url
#> 0 0 0
#> account_lang profile_background_url profile_image_url
#> 0 0 0
library(dplyr)
# Keep everything except the two mention columns, which are the only ones
# with a non-zero NA count in the summary printed by colSums().
rbloggers_clean <- select(
  rbloggers,
  -mentions_user_id,
  -mentions_screen_name
)