如何不使用嵌套的for循环并改善我的R代码?

时间:2019-04-29 09:18:55

标签: r vectorization

我在下面的代码中嵌套了一个for循环。

这遍历每一列和每一行-是否有一种简单的方法可以对此向量化?

FYI-循环的内容验证每个条目中的列表是否仅包含NA,从而可以删除整个列。


# install.packages("rtweet")
library("rtweet")             
rbloggers <- get_timeline(user = "Rbloggers", n = 10000)
View(rbloggers)
# install.packages("janitor")
library("janitor")             

rbloggers <- janitor::remove_empty(rbloggers, which = "cols")
# this removes the columns with NA or blank - which are not in lists.

# readr::write_csv - would like to use this later and this cannot handle vector of type list.

rbloggers <- as.data.frame(rbloggers)

for (j in 1:ncol(rbloggers)){

    x <- 0
    for (i in 1:nrow(rbloggers)){
      x <- x + all(is.na(rbloggers[i,j][[1]]))
    }

    # if every element is NA, then remove the column
    if(x == nrow(rbloggers)) {rbloggers[,j] <- NULL}

                            # Many ways to remove a column:
                            # # Data[2] <- NULL
                            # # Data[[2]] <- NULL
                            # # Data <- Data[,-2]
                            # # Data <- Data[-2]
}


仅供参考-我试图理解以下参考文献:

1 个答案:

答案 0 :(得分:0)

library(rtweet)             
rbloggers <- get_timeline(user = "Rbloggers", n = 10000)

library(janitor)             

rbloggers <- janitor::remove_empty(rbloggers, which = "cols")

# find the sum of NA in each col
colSums(is.na(rbloggers))
#>                user_id              status_id             created_at 
#>                      0                      0                      0 
#>            screen_name                   text                 source 
#>                      0                      0                      0 
#>     display_text_width               is_quote             is_retweet 
#>                      0                      0                      0 
#>         favorite_count          retweet_count               hashtags 
#>                      0                      0                      0 
#>               urls_url              urls_t.co      urls_expanded_url 
#>                      0                      0                      0 
#>       mentions_user_id   mentions_screen_name                   lang 
#>                   3175                   3175                      0 
#>             geo_coords          coords_coords            bbox_coords 
#>                      0                      0                      0 
#>             status_url                   name               location 
#>                      0                      0                      0 
#>            description                    url              protected 
#>                      0                      0                      0 
#>        followers_count          friends_count           listed_count 
#>                      0                      0                      0 
#>         statuses_count       favourites_count     account_created_at 
#>                      0                      0                      0 
#>               verified            profile_url   profile_expanded_url 
#>                      0                      0                      0 
#>           account_lang profile_background_url      profile_image_url 
#>                      0                      0                      0

library(dplyr)

# remove the cols that consist of NA
rbloggers_clean <- rbloggers %>% 
  select(- mentions_user_id, -mentions_screen_name)