在读取并重新整理多个 csv 文件后,我把结果保存在一个数据框中。我希望用一个 if / else if / else 阶梯来检查 ID 列:如果 ID 与某个用 c() 创建的向量中的数字匹配,就在新的 group 列中填入对应的组名。
# of int. int. not.int. ID
1 50 218.41 372.16 1
3 33 134.94 158.17 3
然后我用 c() 创建了以下向量,用来与 ID 进行比对。
# ID sets for each treatment group, kept as character vectors so they can be
# matched against a character ID column.
veh <- c("1", "5")
thc1 <- c("2", "6")
thc2 <- c("3", "7")
thc3 <- c("4", "8")
然后我创建了一个 if / else if / else 阶梯来填入相应的值。
# Attempted if / else-if ladder that should label each row by its ID.
# NOTE: `if` takes a single condition, but `social.dat$ID == veh` compares
# whole vectors; this is what triggers the "condition has length > 1" warning.
# NOTE: the else-if conditions test `social.dat$group`, not `social.dat$ID`.
# NOTE: each branch evaluates a `==` comparison instead of yielding a label.
social.dat$group = if (social.dat$ID == veh) {
social.dat$group == "veh"
} else if (social.dat$group == thc1) {
social.dat$group == "thc1"
} else if (social.dat$group == thc2) {
social.dat$group == "thc2"
} else {
social.dat$group == "thc3"
}
但是,我然后收到此警告消息。
Warning message:
In if (social.dat$ID == veh) { :
the condition has length > 1 and only the first element will be used
我以多种不同的形式查询了此警告消息,但没有发现真正有用的信息。任何帮助,将不胜感激,或者其他选择也将是很好。如果我错过了解决方案,我事先表示歉意。
编辑: 我尝试使用
# Two successive ifelse() assignments to the same column; the second call
# overwrites whatever the first one stored.
# NOTE: `thc` is not defined in the code shown (only thc1, thc2 and thc3 are).
social.dat$group = ifelse(social.dat$ID == veh, "veh", "thc")
social.dat$group = ifelse(social.dat$ID == thc, "thc", "veh")
但是每执行一行,都会改写数据框中 group 列的结果。
这是我用来重新排列csv文件并获取我上面首先提到的数据帧的完整代码。
#calls packages
library(tidyr)
library( plyr )
library(reshape2)
# Folder that holds the csv files; change this to where your files are kept.
wd <- "C:/Users/callej03/Desktop/test"
setwd(wd)
# list.files() interprets `pattern` as a regular expression, not a glob, so
# match the literal ".csv" extension at the end of the name.
files <- list.files(
  path = wd, pattern = "\\.csv$", full.names = TRUE, recursive = FALSE
)
################################################################
# Builds a list holding the number of interactions for each file in the
# folder. Every even-numbered row (after the first row is dropped) is counted
# as an interaction.
lap.list <- lapply(files, function(x) {
  dat <- read.csv(x, header = TRUE)
  # Assumes the csv has a single column: dropping row 1 then yields a plain
  # vector, and as.data.frame() names the resulting column "dat".
  dat <- dat[-c(1), ]
  dat <- as.data.frame(dat)
  # Split each entry on the space into lap number and duration.
  dat <- separate(
    data = dat, col = dat, into = c("lap", "duration"), sep = "\\ "
  )
  # seq_len() stays empty for a file with no lap rows, unlike 1:nrow().
  dat$count <- seq_len(nrow(dat))
  dat$interacting <- dat$count %% 2 == 0
  int <- dat[which(dat$interacting == TRUE), ]
  sum(int$interacting)
})
#########################################################################
# Turn the per-file counts into a one-column table and label each row with the
# file name reduced to its leading number, i.e. the rat ID.
lap.list <- t(as.data.frame(lap.list))
colnames(lap.list) <- "# of int."
rownames(lap.list) <- gsub(
  "/", "",
  gsub("([0-9]+).*$", "\\1", sub(wd, "", files)),
  fixed = TRUE
)
###########################################################################
# Reads one lap-time csv and returns its rows with `lap`, `duration`, `count`
# and `interacting` columns. The parsing matches the counting step, but the
# rows themselves are returned so the durations can be manipulated.
read_lap_rows <- function(path) {
  laps <- read.csv(path, header = TRUE)
  # Assumes the csv has a single column: dropping row 1 then yields a plain
  # vector, and as.data.frame() names the resulting column "laps".
  laps <- laps[-c(1), ]
  laps <- as.data.frame(laps)
  laps <- separate(
    data = laps, col = laps, into = c("lap", "duration"), sep = "\\ "
  )
  laps$count <- seq_len(nrow(laps))
  # Even-numbered rows are flagged as interactions.
  laps$interacting <- laps$count %% 2 == 0
  laps
}

# Read every file once, then split its rows by the interaction flag.
lap_rows <- lapply(files, read_lap_rows)
int.duration <- lapply(lap_rows, function(laps) laps[laps$interacting, ])
noint.duration <- lapply(lap_rows, function(laps) laps[!laps$interacting, ])
###################################################################
# Splits each duration into minutes, seconds and hundredths, combines them
# into total seconds, sums the times per file, and joins the sums with the
# interaction counts.

# Converts the `duration` column of a melted lap table to seconds and totals
# it per file.
#   melted:   result of melt() on the list of per-file lap tables; uses its
#             `duration` column and the `L1` column (treated as the file).
#   col_name: name given to the single column of the result.
# Durations are sliced by position: characters 1-2 are minutes, 4-5 are
# seconds, and the last two characters are divided by 100.
# Returns a one-column data frame with one row per value of `L1`.
total_seconds_by_file <- function(melted, col_name) {
  duration <- as.character(melted$duration)
  len <- nchar(duration)
  minutes <- as.numeric(substr(duration, 1, 2))
  seconds <- as.numeric(substr(duration, 4, 5))
  hundredths <- as.numeric(substr(duration, len - 1, len))
  total <- minutes * 60 + seconds + hundredths / 100
  totals <- as.data.frame(tapply(total, melted$L1, sum))
  colnames(totals) <- col_name
  totals
}

int.duration <- as.data.frame(melt(int.duration))
noint.duration <- as.data.frame(melt(noint.duration))
int.tot <- total_seconds_by_file(int.duration, "int.")
noint.tot <- total_seconds_by_file(noint.duration, "not.int.")

# Combine the counts with both totals and expose the row names as an ID column.
social.dat <- cbind(lap.list, int.tot, noint.tot)
social.dat$ID <- rownames(social.dat)
这是我正在使用的csv文件的示例。这些词都在同一列中,并用空格隔开。
Total time 10:00.61
Lap times
01 00:07.46
02 00:05.64
03 00:01.07
04 00:01.04
05 00:04.71
06 00:06.43
07 00:12.52
08 00:07.34
09 00:05.46
10 00:05.81
11 00:05.52
12 00:06.51
13 00:10.75
14 00:00.83
15 00:03.64
16 00:02.75
17 00:01.20
18 00:06.17
19 00:04.40
20 00:00.75
21 00:00.84
22 00:01.29
23 00:02.31
24 00:03.04
25 00:02.85
26 00:05.86
27 00:05.76
28 00:05.06
29 00:00.96
30 00:06.91
答案 0 :(得分:3)
@akrun 建议使用 `ifelse`,它非常适合一两层嵌套。超过这个层数时,我个人更倾向于使用 `dplyr::case_when`,或者用一个单独的 `data.frame` 配合某种合并/联接(merge/join)。
如果您处理的是始终根据同一字段(此处为 `id`)来分配组的“简单情况”,那么合并/联接是我的首选方法:(在我看来)它让维护变得更简单。(我说“始终根据同一字段”,是指您也可以用多个字段(例如 `id1` 和 `id2`)来定义各条记录及其所属的组;但对您的示例而言这可能过于复杂,因此本答案只使用单个键进行合并。)
三种方法(示例数据见本答案末尾):
# Left-merge the lookup table onto the data: all.x = TRUE keeps every row of
# `dat`, and ids without a match in `groups` get NA for `group`.
dat2a <- merge(x = dat, y = groups, by = "id", all.x = TRUE)
dat2a
# id int group
# 1 1 22 veh
# 2 2 33 thc1
# 3 3 44 <NA>
请注意,任何未包含在组定义中的 `id`,其 `group` 都会是 `NA`。您可以用以下方法为它们指定一个默认组:
# Replace the NA groups left by unmatched ids with a fallback label.
dat2a$group <- replace(dat2a$group, is.na(dat2a$group), "somedefaultgroup")
dat2a
# id int group
# 1 1 22 veh
# 2 2 33 thc1
# 3 3 44 somedefaultgroup
`dplyr`:合并/联接的概念相同,但使用 `dplyr` 风格的动词。
library(dplyr)
# The left join keeps every row of `dat`; coalesce() then swaps the NA group
# of any unmatched id for the fallback label.
dat2c <- dat %>%
  left_join(groups, by = "id") %>%
  mutate(group = coalesce(group, "somedefaultgroup"))
`dplyr::case_when`(这种方法不使用我为合并/联接示例定义的 `groups`。)
如果您确实想用类似 if / else 的语句来写阶梯/嵌套判断,`case_when` 更易于阅读(和调试),而且可能更快,具体取决于您的用例。
最直接:
library(dplyr)
dat2b <- dat
# case_when() checks the conditions in order and uses the label of the first
# one that is TRUE; the final `TRUE ~ ...` line is the catch-all default.
dat2b$group <- case_when(
dat2b$id %in% c("1","5") ~ "veh",
dat2b$id %in% c("2","6") ~ "thc1",
TRUE ~ "somedefaultgroup"
)
使用 `with(...)` 的写法比上一种更易读,但功能相同。(如果您的“阶梯”长得多,这样可以明显减少代码的字符数。)
dat2b <- dat
# with() evaluates case_when() inside `dat2b`, so `id` refers to that column
# without the `dat2b$` prefix.
dat2b$group <- with(dat2b, case_when(
id %in% c("1","5") ~ "veh",
id %in% c("2","6") ~ "thc1",
TRUE ~ "somedefaultgroup"
))
如果您想使用 `dplyr` 的动词,则可以这样写:
dat2b <- dat
# Same ladder written as a mutate() step in a pipeline; `id` is looked up in
# the data frame being piped in.
dat2b <- dat2b %>%
mutate(
group = case_when(
id %in% c("1","5") ~ "veh",
id %in% c("2","6") ~ "thc1",
TRUE ~ "somedefaultgroup"
)
)
在执行合并/联接操作时,使用 `stringsAsFactors=FALSE` 很重要,这样(在新分配的组中)就不会遇到因子水平不存在的问题。(这个问题可以绕过去,但是……)
# Example data: character ids with an integer measurement for each.
dat <- data.frame(
  id = as.character(1:3),
  int = c(22L, 33L, 44L),
  stringsAsFactors = FALSE
)
(可选)供上述合并/联接示例使用的分组表:
# Lookup table mapping each id to its group label, one row per id.
groups <- data.frame(
  id = c("1", "5", "2", "6"),
  group = rep(c("veh", "thc1"), each = 2),
  stringsAsFactors = FALSE
)
groups
# id group
# 1 1 veh
# 2 5 veh
# 3 2 thc1
# 4 6 thc1
前提是您为每个唯一的 `id` 各定义一行。
答案 1 :(得分:0)
感谢 @r2evans,以下代码(使用 `dplyr::case_when`)完全按照我想要的方式工作:
# Label each row by the ID set it belongs to. There is no catch-all branch, so
# an ID outside all four sets gets NA.
social.dat$group <- with(social.dat, case_when(
  ID %in% c("1", "5") ~ "veh",
  ID %in% c("2", "6") ~ "thc1",
  ID %in% c("3", "7") ~ "thc2",
  ID %in% c("4", "8") ~ "thc3"
))
这是数据框的最终输出
# of int. int. not.int. ID group
1 50 218.41 372.16 1 veh
3 33 134.94 158.17 3 thc2