Question

我有一个RM流程，其中包含“FP Growth”运算符 -> “应用关联规则”（Apply Association Rules）运算符。数据表1是该RM流程的输出。

现在，我想写一个R程序，数据表1作为输入，数据表2作为输出

数据表1

 id confidence(a)   a   confidence(b)   b
100   1           TRUE        1        FALSE
101   0           TRUE        0        FALSE
102   1           TRUE        1        TRUE
103   0           TRUE        0        TRUE

数据表2

 ID  SET
100   b
101  none
102  none
103  none

这里是获取数据表2表的逻辑 -

对于特定的id，当置信度（a / b / nothing）= 0时，忽略它们
对于特定的id，当“confidence(a)”为1且项目“a”的值为“TRUE”时 -> 忽略该项
对于特定的id，当“confidence(a)”为1且项目“a”的值为“FALSE”时 -> 在“SET”列中列出该项（如数据表2所示）
因此，当confidence() = 1且项目值 = FALSE时，所有满足该条件的项目都应存入“SET”列，各项目之间用逗号分隔

请告诉我如何获取数据表2。任何帮助将不胜感激。谢谢

Answer 1

我将您的数据表1重新创建为dt1：

# Recreate Data Table 1: one row per id; each item column (a, b) is preceded
# by its confidence column. TRUE/FALSE are spelled out because the T/F
# aliases are ordinary variables that can be reassigned.
dt1 <- data.frame(
  id = 100:103,
  confidencea = c(1, 0, 1, 0),
  a = rep(TRUE, 4),
  confidenceb = c(1, 0, 1, 0),
  b = c(FALSE, FALSE, TRUE, TRUE)
)

str(dt1)
'data.frame':   4 obs. of  5 variables:
 $ id         : int  100 101 102 103
 $ confidencea: num  1 0 1 0
 $ a          : logi  TRUE TRUE TRUE TRUE
 $ confidenceb: num  1 0 1 0
 $ b          : logi  FALSE FALSE TRUE TRUE

#For the code below to work, the columns of dt1 must be in the format above. 
#If not, convert them to the correct format.
#For example, if 'id' is in the format 'factor', use 'as.integer(as.character(dt1$id))'.
#For example, if 'a' is in the format 'character', use 'as.logical(dt1$a)'.

然后下面的代码创建dt2：

# Build dt2 (columns: id, SET) from dt1.
# For every id, SET lists, comma-separated, the items whose OWN confidence is
# non-zero and whose value is FALSE; ids without such an item get 'none'.
# Judging each item against its own confidence column matters: a FALSE item
# with confidence 0 must be ignored even if another item in the same row has
# a non-zero confidence.

# Confidence and item columns are paired by position: the k-th confidence
# column belongs to the k-th remaining non-id column.
conf_cols <- grep('^confidence', names(dt1), value = TRUE)
item_cols <- setdiff(names(dt1), c('id', conf_cols))
stopifnot(length(conf_cols) == length(item_cols))

# flagged[r, k] is TRUE when item k has to be reported for row r.
# Item columns must be logical (see the format notes above the code).
conf_on <- as.matrix(dt1[conf_cols]) > 0
item_off <- !as.matrix(dt1[item_cols])
flagged <- conf_on & item_off

# which() drops NA, so a missing confidence or value never reports an item.
# seq_len() keeps this safe for a zero-row dt1.
set <- vapply(seq_len(nrow(dt1)), function(r) {
  hits <- item_cols[which(flagged[r, ])]
  if (length(hits) == 0L) 'none' else paste(hits, collapse = ',')
}, character(1))

# One row per id, ordered by id (the ordering merge() used to produce).
dt2 <- data.frame(id = dt1$id, SET = set, stringsAsFactors = FALSE)
dt2 <- dt2[order(dt2$id), , drop = FALSE]
rownames(dt2) <- NULL

Answer 2

解决此问题的一种方法是使用tidyr::gather将第一个数据框（此处命名为df1）从宽格式转换为长格式，然后对生成的键和值数据列进行操作（按id分组），以行成对方式评估生成的Set中需要的内容。然后，使用summarise配合一个函数将行折叠为字符串（字符向量）。要使这种方法起作用，我们必须具有交替出现的置信度列和相应的项目列，如OP中所示。

我们首先定义两个函数：

f <- function(k, v) {
  o <- rep(NA,length(v))
  o[c(FALSE,TRUE)] <- ifelse(v[c(TRUE,FALSE)]==1 & v[c(FALSE,TRUE)]==0, k[c(FALSE,TRUE)], NA)
  o
}
f2 <- function(x) {
  x <- na.omit(x)
  if (length(x)==0) "none" else toString(x)
}

第一个函数以成对方式在长格式列中运行，根据条件为每对行生成(NA, NA)或(NA, item)。第二个函数删除所有NA，然后将行折叠为逗号分隔的字符串。如果行中的所有元素都是NA，则此函数会按要求返回"none"。有了这些函数，使用dplyr：

关于您的数据：

library(dplyr)
library(tidyr)
# Reshape df1 to long format (one row per id/column pair), let f() mark the
# items to report within each id, then collapse each id's marks with f2().
# local() keeps the intermediate tables out of the workspace.
df2 <- local({
  long_df <- gather(df1, key = "key", value = "Set", -id)
  by_id <- group_by(long_df, id)
  marked <- mutate(by_id, Set = f(key, Set))
  marked <- select(marked, -key)
  summarise(marked, Set = f2(Set))
})

打印结果，得到：

print(df2)
### A tibble: 4 x 2
##     id   Set
##  <int> <chr>
##1   100     b
##2   101  none
##3   102  none
##4   103  none

现在来看一个稍微有趣的数据集，其中有多个项目满足条件：

# Extended example input with a third item `c`: id 100 now has two FALSE
# items (b and c) whose confidence is 1.
# check.names = FALSE keeps the parentheses in the confidence column names.
df1 <- data.frame(
  id = 100:103,
  `confidence(a)` = c(1L, 0L, 1L, 0L),
  a = c(TRUE, TRUE, TRUE, TRUE),
  `confidence(b)` = c(1L, 0L, 1L, 0L),
  b = c(FALSE, FALSE, TRUE, TRUE),
  `confidence(c)` = c(1L, 0L, 1L, 0L),
  c = c(FALSE, TRUE, FALSE, FALSE),
  check.names = FALSE
)
##   id confidence(a)    a confidence(b)     b confidence(c)     c
##1 100             1 TRUE             1 FALSE             1 FALSE
##2 101             0 TRUE             0 FALSE             0  TRUE
##3 102             1 TRUE             1  TRUE             1 FALSE
##4 103             0 TRUE             0  TRUE             0 FALSE

Answer 3

这可能不是最好的方法。但是，它可以使用简单的ifelse（）和for循环来完成。以下代码可以执行给定示例所需的操作。您可能需要根据您的要求进行修改。

# Example input (Data Table 1): one row per id; every item column (a, b) has
# a matching confidence column (confidence_a, confidence_b).
id <- c(100, 101, 102, 103)
confidence_a <- c(1, 0, 1, 0)
a <- c(TRUE, TRUE, TRUE, TRUE)
confidence_b <- c(1, 0, 1, 0)
b <- c(FALSE, FALSE, TRUE, TRUE)

# data.frame() takes the column names from the variable names above.
Data_Table_1 <- data.frame(id, confidence_a, a, confidence_b, b)

# Label one item for every row.
#   confidence: confidence values of the item (one per row)
#   value:      item values (logical, or "TRUE"/"FALSE" strings)
#   label:      name to report for the item
# Returns a character vector: `label` where the item's own confidence is
# non-zero and its value is FALSE, "none" everywhere else. Each item is
# judged only against its own confidence, never another item's.
flag_item <- function(confidence, value, label) {
  hit <- confidence != 0 & !as.logical(value)
  hit[is.na(hit)] <- FALSE  # a missing confidence/value never reports an item
  flags <- rep("none", length(hit))
  flags[hit] <- label
  flags
}

# Merge per-item label vectors (as produced by flag_item) row by row into a
# comma-separated string; rows without any reported item become "none".
combine_flags <- function(...) {
  flags <- cbind(...)
  vapply(seq_len(nrow(flags)), function(row) {
    hits <- flags[row, flags[row, ] != "none"]
    if (length(hits) == 0L) "none" else paste(hits, collapse = ",")
  }, character(1))
}

# Per-item labels, then the combined SET value for every row.
SET_a <- flag_item(Data_Table_1$confidence_a, Data_Table_1$a, "a")
SET_b <- flag_item(Data_Table_1$confidence_b, Data_Table_1$b, "b")
SET <- combine_flags(SET_a, SET_b)

# Data Table 2: one row per id with its SET value.
Data_Table_2 <- data.frame(
  ID = Data_Table_1$id,
  SET = SET,
  stringsAsFactors = FALSE
)

您可以优化给定代码，在列上添加更多循环。

如何通过在R

3 个答案: