在R中,如何清除数据框列表列中为空 / NULL / NA 的行?

时间:2019-01-21 15:13:12

标签: r dataframe dplyr tidyverse

给出以下以列表为值的数据框:

  /**
   * Offers the given blob to the user as a download named "myBlobFile.png".
   *
   * When `window.navigator.msSaveBlob` is defined (IE 11) the blob is saved
   * through that API; otherwise a hidden, temporary anchor element pointing at
   * an object URL is clicked on the next animation frame.
   *
   * @param {Blob} blob - The data to save.
   */
  function myCallback(blob) {
    var filename = "myBlobFile.png";

    // IE 11: check first, so that no object URL or anchor is created on a
    // path that returns without ever releasing them.
    if (window.navigator.msSaveBlob !== undefined) {
      window.navigator.msSaveBlob(blob, filename);
      return;
    }

    var url = window.URL.createObjectURL(blob);

    var a = document.createElement("a");
    a.style.display = "none";
    a.href = url;
    a.download = filename;

    document.body.appendChild(a);
    requestAnimationFrame(function() {
      a.click();
      // Release the object URL and the temporary anchor once the click fired.
      window.URL.revokeObjectURL(url);
      document.body.removeChild(a);
    });
  }

  /**
   * Asks `myDiagram` for its image data as a blob on a white background and
   * passes `myCallback` as the callback option.
   */
  function makeBlob() {
    var options = {
      background: "white",
      returnType: "blob",
      callback: myCallback
    };
    // The return value is not used here; the blob is expected to reach
    // myCallback (NOTE(review): confirm against the diagram library's docs).
    myDiagram.makeImageData(options);
  }

我要使用以下方式执行Jaccard计算:

# Example data frame with five rows: a character id column plus two list
# columns. Rows 2 and 4 hold NULL in both list columns; the contacts entries
# of rows 1, 3 and 5 are empty lists.
df <- local({
  # Rows 1, 3 and 5 carry the same 47 SSID strings, so they are written once.
  ssids <- c(
    "qjJf5iZtYboSPvqe1oa/xg==", "ul7kroLEB2cZx6AMGhjnrA==",
    "OYRT/hYu1Dl3/S5WIWyLHA==", "HFiSH/Tu0RSaQgIbDEZfeA==", "gUBxBfxjGdyPNzqYX7t6nA==",
    "m7UqzqaXUm1GkXMbxf+SJw==", "KjxvZwsVCNSTtXXKiidmjw==", "3UOqe+4qPVZYXvja8GBEqg==",
    "a9Ba8b19tY/bprM7WA326A==", "uStr9Fg+JlU9B+hdBCafZg==", "i9J11W00HFmoeCDObOfSdA==",
    "A9vOz8zSrwDiQcKv8hk64Q==", "/8QeMoqFwd/eJ+/6NKk1iQ==", "HbyJqQxUfH6oiW3skqPzGQ==",
    "51H9RcZmdRgkgg4X6U/mhQ==", "dwat86ppe1b/WXSaGi8r3w==", "yBCbMedxtZdiGFXmTfk2eQ==",
    "wKSIsw3sDPRQhLIhdQkBJw==", "3LkFUenHOXWL4Be5T4XmaQ==", "Krni6eGQUnZuL/jU0MzKNA==",
    "Wt9BCH4guyC4oSIHwE8XGA==", "rbgxp/3YPdHiownOdZHf+A==", "34rmNRgT/xFDXIDwHKIY6Q==",
    "pT3zFcGdlJKmR+khJLMoVw==", "eni3X9I2B4KRK+sho2MbjA==", "sxuba/1Brg4CrYL8AFv8ZQ==",
    "EYIxPOXPVvop99YD0vjXPA==", "JC1xqrtmQEaohwzviYDFYA==", "qv+cfEEqsIGrDFuEqpkQuw==",
    "d3xXMR1RDKZdrDwQd97kNQ==", "qU1JULumBTqw+m/rLr4E7A==", "teDCJvNdyjktWD6leDpCmw==",
    "ytSBHvzbEACq56aEHZlXEw==", "eV7WGimPD01weRI19ojO3g==", "vNkJyD9KOzOprGkYyfViMA==",
    "r8jjZXWyax7JPfJUPFwRTQ==", "rJ1N3ONwDBK+jwFf+7xeHg==", "2xPbTqIww1KI/tVL2UH1cw==",
    "1hk1AOU4DZXV52Auyr2FHA==", "aNH8uS5nrlwcHb8rLdZeXQ==", "5JPQs2z4N1Dru0dGI9ImBQ==",
    "nbQIn5G4uyl8b1+A6aVkQg==", "A/UcwEccakKDuiATgoP1NA==", "JC57Ib2V7fOU/CgBk2R41g==",
    "PsI8Ys++JveA+SuafbB8pg==", "eXiuBymYN+tcbjtpM9Vxmg==", "jEdP3Rs02d/4UE8G1GeE3A=="
  )

  structure(
    list(
      keys.userId = c("9875", "5465", "1234", "4567", "8910"),
      user_data.SSIDs = list(ssids, NULL, ssids, NULL, ssids),
      user_data.contacts = list(list(), NULL, list(), NULL, list())
    ),
    row.names = c(NA, 5L),
    class = "data.frame"
  )
})

我想稍微清理一下数据。

我想删除列表列中值为 NULL、NA 或长度为 0 的列表的那些行。

请告知执行此操作的正确方法,我首先想到的是:

#' Jaccard similarity of two vectors treated as sets
#'
#' @param vector1,vector2 Vectors (or lists) whose distinct elements are
#'   compared.
#' @return The number of shared elements divided by the number of elements
#'   in the union. Two empty inputs give `NaN` (0 / 0).
jaccard <- function(vector1, vector2) {
  shared <- intersect(vector1, vector2)
  combined <- union(vector1, vector2)
  length(shared) / length(combined)
}

# Element-wise version: applies jaccard() to corresponding elements of its
# two arguments.
jaccardV <- Vectorize(FUN = jaccard)

但是它对每一行都返回 FALSE。

有什么方法可以正确有效地做到这一点?请告知。

最后清洗后,我将运行:

# Flag every row whose contacts entry is NULL or flattens to nothing.
# The check has to run once per element: is.null(unlist(column)) looks at the
# whole list column at once and yields a single value that is recycled to
# all rows.
df %>%
  dplyr::mutate(
    isNull = vapply(
      user_data.contacts,
      function(x) is.null(unlist(x)),
      logical(1)
    )
  )

3 个答案:

答案 0 :(得分:1)

这是一种实现方法:

library(dplyr)
library(purrr)
# The filters run one after another on purpose: each later predicate only
# sees the elements that survived the previous step.
df %>%
  # 1. keep rows whose contacts entry is not NULL
  filter(map_lgl(user_data.contacts, function(x) !is.null(x))) %>%
  # 2. keep rows whose contacts entry has at least one element
  filter(map_lgl(user_data.contacts, function(x) length(x) != 0)) %>%
  # 3. keep rows whose contacts entry is not NA (is.na() must return a
  #    single value here, i.e. every remaining entry has length one)
  filter(map_lgl(user_data.contacts, function(x) !is.na(x))) %>%
  # compare each row's contacts with those of the previous row
  mutate(contacts_jaccard = jaccardV(user_data.contacts, lag(user_data.contacts)))

这不会产生任何输出,因为在您提供的模拟数据中,所有行都会被删除。如果唯一的目的是随后删除这些行,则不必创建新列 isNull。我现在更倾向于使用 map 而不是 sapply,因为用它可以很容易地强制返回特定类型的结果。在这种情况下,map_lgl 只会返回 TRUE / FALSE 结果。

请注意,如果列表列中某个元素的长度大于1,则应改用以下代码:

# Variant for list entries that may hold more than one element: only the
# first element of each entry is tested for NA.
df %>%
  # NULL entries and empty lists both have length 0, so one length test
  # removes them together
  filter(lengths(user_data.contacts) > 0) %>%
  filter(map_lgl(user_data.contacts, function(x) !is.na(x)[1])) %>%
  # compare each row's contacts with those of the previous row
  mutate(contacts_jaccard = jaccardV(user_data.contacts, lag(user_data.contacts)))

答案 1 :(得分:1)

我认为这应该对您有用。浏览“联系人”列,创建一个新列,告知它是否为空,然后根据新列将其过滤掉

library(tidyverse)

# Keep only the rows whose contacts entry is not NULL.
# map_lgl() always returns a logical vector, whereas sapply() returns an
# empty list for zero-row input, so the helper column stays type-stable.
new_df <- df %>%
  as_tibble() %>%
  mutate(is_Null = map_lgl(user_data.contacts, is_null)) %>%
  filter(!is_Null)

new_df

答案 2 :(得分:1)

user_data.contacts 是一个列表列,当您对该列执行 mutate 时,整列会被当作一个列表来处理。请尝试在 mutate 之前先用 rowwise 将数据按行分组。

library(dplyr)

# Evaluate is.null() one row at a time, then drop the row-wise grouping so
# that later verbs operate on whole columns again.
df %>%
  rowwise() %>%
  mutate(isNull = is.null(unlist(user_data.contacts))) %>%
  ungroup()


# A tibble: 5 x 4 # added data to 1st observation for test
  keys.userId user_data.SSIDs user_data.contacts isNull
  <chr>       <list>          <list>             <lgl> 
1 9875        <chr [47]>      <list [3]>         FALSE 
2 5465        <NULL>          <NULL>             TRUE  
3 1234        <chr [47]>      <list [0]>         TRUE  
4 4567        <NULL>          <NULL>             TRUE  
5 8910        <chr [47]>      <list [0]>         TRUE