我有这种格式的csv:
Col1_Status Col1_Value Col2_Status Col2_Value Col3_Status Col3__Value
LOW 5 HIGH 5 LOW 5
LOW 8 HIGH 8 LOW 8
HIGH 82 HIGH 8 LOW 7
HIGH 83 NORMAL 8 LOW 7
HIGH 82 NORMAL 8 LOW 7
我想创建一个包含高和低列的新数据框,例如:
Col1_High Col1_Low Col2_High Col2_Low Col3_High Col3_Low
82 5 5 NA NA 5
83 8 8 NA NA 8
82 NA 8 NA NA 7
NA NA NA NA NA 7
NA NA NA NA NA 7
最好的方法是什么?
到目前为止,我的思路是:
# Asker's unfinished attempt: split `ret` into its status and value columns,
# then try to derive per-attribute "High"/"Low" columns from them.
# Extract the status columns (names containing "Status") from the original
# data frame `ret` into their own data frame.
statusDF <- ret[grepl("Status", colnames(ret))]
# Extract the value columns (names containing "Value") from the original
# data frame `ret` into their own data frame.
originalValueDF <- ret[grepl("Value", colnames(ret))]
# Intended to create new columns <attribute>_High and <attribute>_Low.
# NOTE(review): `[[` here only READS a column named e.g. "<name>_High" from
# originalValueDF; it does not create one. Both lines assign to the same
# variable, so each assignment overwrites the previous one.
for(i in names(originalValueDF)){
newValueDF <- originalValueDF[[paste(i, 'High', sep = "_")]]
newValueDF <- originalValueDF[[paste(i, 'Low', sep = "_")]]
}
# Intended to populate both columns based on the matching status column.
# NOTE(review): `originalValueDF$i` looks up a column literally named "i",
# not the column whose name is stored in `i` (that would be
# originalValueDF[[i]]). The comparison is also made against the value
# columns rather than statusDF, and against "High" although the sample data
# spells the status "HIGH".
for(i in names(originalValueDF)){
if (originalValueDF$i == "High"){
temp <- # stuck here
}
}
如有任何建议,不胜感激。
答案 0 :(得分:1)
以下是一种大量使用 lapply 的做法。我们首先创建一个列表(l1),其中针对每一对"状态/数值"列,分别列出状态为 'HIGH' 和 'LOW' 的数值。但是,这些向量的长度各不相同,因此需要把它们全部用 NA 补齐到其中最大的长度(在我们的例子中为 ind)。然后把每组向量转换为具有 2 列(High 和 Low)的矩阵,并使用 do.call 和 cbind 得到最终的数据框。
# `df` is the input data frame, laid out as alternating status/value columns
# (status in the odd positions, the matching value right after it).
# For every status/value pair, collect the values flagged HIGH and LOW.
l1 <- lapply(seq(1, ncol(df), by = 2), function(k) {
  flags <- df[k]
  vals <- df[k + 1]
  list(HIGH = vals[flags == "HIGH"], LOW = vals[flags == "LOW"])
})
names(l1) <- paste0("Col", seq_along(l1))
# Longest HIGH/LOW vector across all pairs; every vector is padded to this.
ind <- max(unlist(lapply(l1, lengths)))
# Pad each vector with NA up to `ind`, turn each pair into a two-column
# data frame (High, Low) and bind the pairs side by side. cbind() prefixes
# the column names with the list names, giving Col1.High, Col1.Low, ...
do.call(cbind, lapply(l1, function(pair) {
  padded <- lapply(pair, `length<-`, ind)
  setNames(data.frame(matrix(unlist(padded), ncol = 2)), c("High", "Low"))
}))
# Col1.High Col1.Low Col2.High Col2.Low Col3.High Col3.Low
#1 82 5 5 NA NA 5
#2 83 8 8 NA NA 8
#3 82 NA 8 NA NA 7
#4 NA NA NA NA NA 7
#5 NA NA NA NA NA 7
答案 1 :(得分:0)
# Sample data from the question. The last header contains a stray double
# underscore ("Col3__Value"), which is normalised right after loading.
ret <- read.table(text="
Col1_Status Col1_Value Col2_Status Col2_Value Col3_Status Col3__Value
LOW 5 HIGH 5 LOW 5
LOW 8 HIGH 8 LOW 8
HIGH 82 HIGH 8 LOW 7
HIGH 83 NORMAL 8 LOW 7
HIGH 82 NORMAL 8 LOW 7
", header = TRUE, stringsAsFactors = FALSE)
# fix column headers: collapse any run of underscores into a single one
names(ret) <- gsub("(_+)", "_", names(ret))
library(stats)
# extract the column prefixes ("Col1", "Col2", ...) and the matching
# value/status column names; the three vectors are index-aligned
prefixes <- unique(gsub("_.+", "", names(ret)))
value_names <- names(ret[grepl("_Value", names(ret))])
status_names <- names(ret[grepl("_Status", names(ret))])

# For every prefix, pull the values whose status equals `status`, pad the
# result with NA up to nrow(ret), and name it "<prefix><suffix>".
#   status: status label to match, e.g. "HIGH"
#   suffix: suffix appended to each prefix, e.g. "_High"
# Returns a named list with one vector of length nrow(ret) per prefix.
# Reads `ret`, `prefixes`, `status_names` and `value_names` defined above.
values_for_status <- function(status, suffix) {
  n_rows <- nrow(ret)
  columns <- lapply(seq_along(prefixes), function(i) {
    # `[[` always yields a plain vector, even with a single status column
    hits <- which(ret[[status_names[i]]] == status)
    result <- ret[[value_names[i]]][hits]
    # growing a vector with `length<-` fills the new slots with NA
    length(result) <- n_rows
    result
  })
  setNames(columns, paste0(prefixes, suffix))
}

# get the high values - extract the highs, pad with NA's, name them _High
high_values <- values_for_status("HIGH", "_High")
# get the low values - extract the lows, pad with NA's, name them _Low
low_values <- values_for_status("LOW", "_Low")
# combine: all _Low columns first, then all _High columns
output <- cbind(data.frame(low_values), data.frame(high_values))
output
# Col1_Low Col2_Low Col3_Low Col1_High Col2_High Col3_High
# 1 5 NA 5 82 5 NA
# 2 8 NA 8 83 8 NA
# 3 NA NA 7 82 8 NA
# 4 NA NA 7 NA NA NA
# 5 NA NA 7 NA NA NA