if-else 阶梯在 R 中不起作用

时间:2018-06-25 15:14:19

标签: r if-statement

在读取并重新排列多个csv文件后,我将结果保存在一个数据框中。基本上,我希望用 if / else if 阶梯检查 ID 列:如果其值与某个向量(用 c() 连接而成)中的数字匹配,就在新的“group”列中填入对应的单词。

    # of int.   int.     not.int.  ID
1      50      218.41     372.16    1
3      33      134.94     158.17    3

然后我用 c() 创建了这些向量以供引用。

# Rat IDs belonging to each treatment group, stored as character vectors.
veh <- c("1", "5")
thc1 <- c("2", "6")
thc2 <- c("3", "7")
thc3 <- c("4", "8")

然后我创建了一个 if / else if 阶梯来填入相应的值。

# NOTE(review): if () expects a single TRUE/FALSE, but social.dat$ID == veh
# compares a whole column against the two-element vector veh; that is what
# triggers the "condition has length > 1" warning quoted below.
social.dat$group =  if (social.dat$ID == veh) {
     # `==` is a comparison here; it does not assign the label "veh".
     social.dat$group == "veh"
     # NOTE(review): the remaining conditions test social.dat$group rather than
     # social.dat$ID, unlike the first branch.
     } else if (social.dat$group == thc1) {
     social.dat$group == "thc1"
     } else if (social.dat$group == thc2) {
     social.dat$group == "thc2"
     } else {
     social.dat$group == "thc3"
     }

但是,我然后收到此警告消息。

Warning message:
In if (social.dat$ID == veh) { :
the condition has length > 1 and only the first element will be used

我以多种不同的形式搜索过此警告消息,但没有找到真正有用的信息。如能提供帮助或其他替代方案,我将不胜感激。如果已有解决方案而我没有找到,请见谅。

编辑: 我尝试使用

 # NOTE(review): `thc` is not among the vectors defined earlier (veh, thc1,
 # thc2, thc3), and the second assignment overwrites the result of the first.
 # `==` against the two-element veh recycles it element-wise; it does not test
 # set membership the way %in% does.
 social.dat$group = ifelse(social.dat$ID == veh, "veh", "thc")
 social.dat$group = ifelse(social.dat$ID == thc, "thc", "veh")

但是每运行一行,数据框的输出就会被改变一次。

这是我用来重新排列csv文件并获取我上面首先提到的数据帧的完整代码。

#calls packages
library(tidyr)
library( plyr )
library(reshape2)
# Folder holding the lap-time files; change it to wherever the files are kept.
wd <- "C:/Users/callej03/Desktop/test"
setwd(wd)
# list.files() treats `pattern` as a regular expression, not a glob, so match
# the literal ".csv" extension at the end of the file name.
files <- list.files(path = wd, pattern = "\\.csv$", full.names = TRUE,
                    recursive = FALSE)

################################################################
# Count the interactions recorded in each file in the folder.
# Each file is read, its first data row is dropped, the remaining text is
# split into "lap" and "duration", and every even-numbered row is counted as
# an interaction. The result is a list with one count per file.
lap.list <- lapply(files, function(x) {
  dat <- read.csv(x, header = TRUE)
  dat <- dat[-1, ]
  # as.data.frame() on the resulting vector names its column "dat", which is
  # the column separate() is told to split below.
  # NOTE(review): assumes each CSV has a single column, as in the sample file.
  dat <- as.data.frame(dat)
  dat <- separate(
    data = dat, col = dat, into = c("lap", "duration"),
    sep = "\\      "
  )
  # seq_len() stays empty for a file with no lap rows, where 1:nrow(dat)
  # would yield c(1, 0).
  dat$count <- seq_len(nrow(dat))
  dat$interacting <- dat$count %% 2 == 0
  sum(dat$interacting)
})
#########################################################################
# Turn the per-file counts into a one-column matrix whose row names are the
# file names reduced to their leading number, i.e. the rat ID.
lap.list <- t(as.data.frame(lap.list))
colnames(lap.list) <- "# of int."
rat.ids <- sub(wd, "", files)
rat.ids <- gsub("([0-9]+).*$", "\\1", rat.ids)
rownames(lap.list) <- gsub("/", "", rat.ids, fixed = TRUE)

###########################################################################
# Same per-file parsing as for the interaction counts, stored under a
# different name so it can be manipulated separately. Here the even-numbered
# (interacting) rows themselves are returned, one data frame per file.
int.duration <- lapply(files, function(x) {
  dat2 <- read.csv(x, header = TRUE)
  dat2 <- dat2[-1, ]
  # as.data.frame() names the single column "dat2", which separate() splits.
  dat2 <- as.data.frame(dat2)
  dat2 <- separate(
    data = dat2, col = dat2, into = c("lap", "duration"),
    sep = "\\      "
  )
  # seq_len() stays empty for a file with no lap rows, where 1:nrow(dat2)
  # would yield c(1, 0).
  dat2$count <- seq_len(nrow(dat2))
  dat2$interacting <- dat2$count %% 2 == 0
  dat2[which(dat2$interacting), ]
})

# Per-file parsing identical to the interacting case, but the odd-numbered
# (non-interacting) rows are returned, one data frame per file.
noint.duration <- lapply(files, function(x) {
  dat2 <- read.csv(x, header = TRUE)
  dat2 <- dat2[-1, ]
  # as.data.frame() names the single column "dat2", which separate() splits.
  dat2 <- as.data.frame(dat2)
  dat2 <- separate(
    data = dat2, col = dat2, into = c("lap", "duration"),
    sep = "\\      "
  )
  # seq_len() stays empty for a file with no lap rows, where 1:nrow(dat2)
  # would yield c(1, 0).
  dat2$count <- seq_len(nrow(dat2))
  dat2$interacting <- dat2$count %% 2 == 0
  dat2[which(!dat2$interacting), ]
})
###################################################################
# Split each duration (e.g. "00:07.46" in the sample file) into minutes,
# seconds and the trailing two digits, combine them into total seconds, sum
# the times per file, and bind the totals to the interaction counts.
int.duration <- melt(int.duration)
int.duration <- as.data.frame(int.duration)

# Characters 1-2 are the minutes, 4-5 the seconds, the last two the fraction.
int.left <- as.data.frame(substr(int.duration$duration, 1, 2))
colnames(int.left) <- "min"
int.mid <- as.data.frame(substr(int.duration$duration, 4, 5))
colnames(int.mid) <- "sec"
int.right <- as.data.frame(substr(
  int.duration$duration,
  nchar(int.duration$duration) - 1, nchar(int.duration$duration)
))
colnames(int.right) <- "ms"

int.time <- cbind(int.left, int.mid, int.right)
int.time$min <- as.numeric(as.character(int.time$min))
int.time$sec <- as.numeric(as.character(int.time$sec))
int.time$ms <- as.numeric(as.character(int.time$ms))
# Two trailing digits -> fraction of a second; minutes -> seconds.
int.time$ms <- int.time$ms / 100
int.time$min <- ifelse(int.time$min > 0, int.time$min * 60, 0)
int.time$sum <- rowSums(int.time)

# NOTE(review): L1 is presumably the list-index column added by melt(), i.e.
# the position of the source file in `files` -- confirm.
int.file <- as.data.frame(int.duration$L1)
colnames(int.file) <- "file"
int.time <- cbind(int.time, int.file)
int.tot <- as.data.frame(tapply(int.time$sum, int.time$file, sum))
colnames(int.tot) <- "int."
social.dat <- cbind(lap.list, int.tot)

# Total non-interacting time per file: split each duration into minutes,
# seconds and the trailing two digits, convert to seconds, sum per file, and
# append the totals (plus an ID column taken from the row names) to social.dat.
noint.duration <- as.data.frame(melt(noint.duration))

noint.left <- setNames(
  as.data.frame(substr(noint.duration$duration, 1, 2)), "min"
)
noint.mid <- setNames(
  as.data.frame(substr(noint.duration$duration, 4, 5)), "sec"
)
noint.right <- setNames(
  as.data.frame(substr(
    noint.duration$duration,
    nchar(noint.duration$duration) - 1, nchar(noint.duration$duration)
  )),
  "ms"
)

noint.time <- cbind(noint.left, noint.mid, noint.right)
noint.time[] <- lapply(
  noint.time,
  function(column) as.numeric(as.character(column))
)
noint.time$ms <- noint.time$ms / 100
noint.time$min <- ifelse(noint.time$min > 0, noint.time$min * 60, 0)
noint.time$sum <- rowSums(noint.time)

noint.file <- setNames(as.data.frame(noint.duration$L1), "file")
noint.time <- cbind(noint.time, noint.file)
noint.tot <- setNames(
  as.data.frame(tapply(noint.time$sum, noint.time$file, sum)), "not.int."
)
social.dat <- cbind(social.dat, noint.tot)
social.dat$ID <- rownames(social.dat)

这是我正在使用的csv文件的示例。所有内容都在同一列中,并用空格隔开。

Total time  10:00.61
Lap times
01      00:07.46
02      00:05.64
03      00:01.07
04      00:01.04
05      00:04.71
06      00:06.43
07      00:12.52
08      00:07.34
09      00:05.46
10      00:05.81
11      00:05.52
12      00:06.51
13      00:10.75
14      00:00.83
15      00:03.64
16      00:02.75
17      00:01.20
18      00:06.17
19      00:04.40
20      00:00.75
21      00:00.84
22      00:01.29
23      00:02.31
24      00:03.04
25      00:02.85
26      00:05.86
27      00:05.76
28      00:05.06
29      00:00.96
30      00:06.91

2 个答案:

答案 0 :(得分:3)

@akrun建议使用ifelse,它非常适合一两层嵌套。超过这个层数时,我个人更倾向于使用dplyr::case_when,或者用一个单独的data.frame做合并/联接。

如果属于始终按相同字段(这里是id)分配的“简单情况”,那么合并/联接是我的首选方法:它让维护变得简单得多(个人看法)。(我说“始终按相同字段”,是指也可以有 id1 和 id2 两个字段,由它们共同确定各条记录及其所属的组。对您的示例来说这可能过于复杂,所以本答案只使用单个键进行合并。)

三种方法(所用数据在本答案末尾):

基本R

# Left merge on id: every row of dat is kept, with its group where one exists.
dat2a <- merge(x = dat, y = groups, by = "id", all.x = TRUE)
dat2a
#   id int group
# 1  1  22   veh
# 2  2  33  thc1
# 3  3  44  <NA>

请注意,未包含在组定义中的任何id都将具有NA组。您可以使用以下方法分配默认组:

# Give rows without a matching group definition a default group.
dat2a$group <- replace(dat2a$group, is.na(dat2a$group), "somedefaultgroup")
dat2a
#   id int            group
# 1  1  22              veh
# 2  2  33             thc1
# 3  3  44 somedefaultgroup

dplyr,合并/加入

概念相似,但使用dplyr式动词。

library(dplyr)
# Left-join the group lookup onto dat, then fill unmatched ids with a default.
dat2c <- dat %>%
  left_join(groups, by = "id") %>%
  mutate(
    group = if_else(is.na(group), "somedefaultgroup", group)
  )

dplyr::case_when

(这不使用我为合并/联接示例定义的groups。)

如果您真的想用类似 if/else 的语句做阶梯/嵌套判断,case_when更易于阅读(和调试),并且可能会更快,具体取决于您的用例。

最直接:

library(dplyr)
# Work on a copy of dat and add the group label as a new column.
dat2b <- dat
# Each formula maps a set of ids to a label; the TRUE line supplies the
# default for ids that match none of the sets above it.
dat2b$group <- case_when(
  dat2b$id %in% c("1","5") ~ "veh",
  dat2b$id %in% c("2","6") ~ "thc1",
  TRUE                     ~ "somedefaultgroup"
)

使用with(...)比前一种写法稍微易读一些,但功能相同。(如果您的“阶梯”长得多,这样可以明显减少代码的字符数。)

# Same labelling, but with() evaluates the conditions inside dat2b so the
# column can be written as a bare `id`.
dat2b <- dat
dat2b$group <- with(dat2b, case_when(
  id %in% c("1","5") ~ "veh",
  id %in% c("2","6") ~ "thc1",
  TRUE               ~ "somedefaultgroup"
))

如果您想使用一些dplyr动词,则:

# Same labelling again, expressed as a piped mutate() that adds the group
# column to a copy of dat.
dat2b <- dat
dat2b <- dat2b %>%
  mutate(
    group = case_when(
      id %in% c("1","5") ~ "veh",
      id %in% c("2","6") ~ "thc1",
      TRUE               ~ "somedefaultgroup"
    )
  )

数据

在执行合并/联接操作时,使用stringsAsFactors=FALSE很重要,这样就不会出现(新分配的组中的)因子级别不存在的问题。 (可以解决,但是...)

# Example input: character ids plus an integer measurement column.
dat <- data.frame(
  id = as.character(1:3),
  int = c(22L, 33L, 44L),
  stringsAsFactors = FALSE
)

以下为可选内容,供上面的合并/联接示例使用:

# Lookup table mapping each id to its group label.
groups <- data.frame(
  id = c("1", "5", "2", "6"),
  group = rep(c("veh", "thc1"), each = 2L),
  stringsAsFactors = FALSE
)
groups
#   id group
# 1  1   veh
# 2  5   veh
# 3  2  thc1
# 4  6  thc1

前提是每个唯一的id只对应一行定义。

答案 1 :(得分:0)

多亏了 @r2evans,以下代码完全按照我想要的方式工作(使用 dplyr::case_when):

# Label each row by the treatment group its ID belongs to.
social.dat$group <- with(social.dat, case_when(
  ID %in% c("1", "5") ~ "veh",
  ID %in% c("2", "6") ~ "thc1",
  ID %in% c("3", "7") ~ "thc2",
  ID %in% c("4", "8") ~ "thc3"
))

这是数据框的最终输出

    # of int.   int.   not.int.   ID   group
1      50      218.41   372.16    1     veh
3      33      134.94   158.17    3    thc2