我在R - generate dynamic number of columns and substring column values问了一个类似的问题,但我的问题中的细节现在已经改变了,所以我要重新发布,因为我需要一个不同的解决方案。
我附上了一张图片,说明了我的起始数据集和我想要达到的终点。我需要一个R解决方案,它使用base R作为我将使用的平台无法使用其他软件包。
原始数据集有多列。对于一些列,即L1,L2,L3,我想;
1)根据列中任何字符串的最大长度生成动态数量的列,例如L1最大长度= 6,因此6个新列各自标记为“L1_1”到“L1_6”
2)将原始字符串分隔为子字符串,每个字符串从左侧开始包含3个字符。倒数第二列将包含2个字符,最后一列将包含1个字符。 (与原始问题不同)
3)对这些子串执行计算,即('a'* 1的数量+('b'* 3的数量)+('c'* 7的数量)并返回该计算的值新栏目。
有人对如何做到这一点有任何想法吗?
提前致谢。
dput(original_data):
structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("", "AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"), L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC", "CACCC", "CBABA"), class = "factor")), .Names = c("ID", "L1",
"L2", "L3"), class = "data.frame", row.names = c(NA, -5L))
dput(interim_data):
structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("",
"AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"),
L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC",
"CACCC", "CBABA"), class = "factor"), L1_1 = structure(c(3L,
2L, 4L, 1L, 5L), .Label = c("", "AAA", "AAB", "BBA", "BCB"
), class = "factor"), L1_2 = structure(c(3L, 2L, 4L, 1L,
5L), .Label = c("", "AAA", "ABB", "BAC", "CBD"), class = "factor"),
L1_3 = structure(c(4L, 2L, 3L, 1L, 5L), .Label = c("", "AAA",
"ACB", "BBC", "BDA"), class = "factor"), L1_4 = structure(c(3L,
2L, 4L, 1L, 5L), .Label = c("", "AAA", "BCC", "CB", "DAB"
), class = "factor"), L1_5 = structure(c(5L, 2L, 4L, 1L,
3L), .Label = c("", "AA", "AB", "B", "CC"), class = "factor"),
L1_6 = structure(c(4L, 2L, 1L, 1L, 3L), .Label = c("", "A",
"B", "C"), class = "factor"), L2_1 = structure(c(3L, 3L,
3L, 1L, 2L), .Label = c("", "ACA", "BAC"), class = "factor"),
L2_2 = structure(c(2L, 3L, 2L, 1L, 4L), .Label = c("", "ACA",
"ACB", "CAA"), class = "factor"), L2_3 = structure(c(3L,
4L, 3L, 1L, 2L), .Label = c("", "AA", "AC", "CBA"), class = "factor"),
L2_4 = structure(c(2L, 3L, 2L, 1L, 2L), .Label = c("", "A",
"BA"), class = "factor"), L2_5 = structure(c(1L, 2L, 1L,
1L, 1L), .Label = c("", "A"), class = "factor"), L3_1 = structure(c(1L,
3L, 2L, 1L, 4L), .Label = c("", "CAB", "CAC", "CBA"), class = "factor"),
L3_2 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "ABA",
"ACC", "BAB"), class = "factor"), L3_3 = structure(c(1L,
4L, 3L, 1L, 2L), .Label = c("", "ABA", "BAC", "CCC"), class = "factor"),
L3_4 = structure(c(1L, 4L, 2L, 1L, 3L), .Label = c("", "AC",
"BA", "CC"), class = "factor"), L3_5 = structure(c(1L, 3L,
3L, 1L, 2L), .Label = c("", "A", "C"), class = "factor")), .Names = c("ID",
"L1", "L2", "L3", "L1_1", "L1_2", "L1_3", "L1_4", "L1_5", "L1_6",
"L2_1", "L2_2", "L2_3", "L2_4", "L2_5", "L3_1", "L3_2", "L3_3",
"L3_4", "L3_5"), class = "data.frame", row.names = c(NA, -5L))
编辑: 代码由@Onyambu提供;
interim=sapply(df, as.character)
interim[,1]=as.numeric(interim[,1]
funfun = function(u){
if(is.numeric(u)) return(u)
s = unique(unlist(strsplit(u,"")))
w = sapply(s,function(x)length(unlist(gregexpr(x,u))))
ifelse(length(s)>0,sum(w["A"]*1,w["B"]* 3,w["C"]*7,na.rm = T),NA)
}
ADD_char=function(x) mapply(funfun,x)
sapply(interim,ADD_char)
dat1 <- cbind(interim[,1:4],sapply(interim[,-(1:4)],ADD_char))
解析错误的结果
答案 0 :(得分:0)
首先尝试使用此代码之前,请确保检查列的类:sapply(Interim,class)
如果上面的代码为您提供了"factor"
而不是"character"
,那么您必须将数据框更改为在characters
而不是factors
。
只需interim=data.frame(t(t(interim)),stringsAsFactors = F)
或interim=sapply(Interim, as.character)
即可完成工作。然后将ID
列更改为numeric
。即interim[,1]=as.numeric(interim[,1])
。确保数据现在是字符后,您可以运行以下代码:
funfun = function(u){
if(is.numeric(u)) return(u)
s = unique(unlist(strsplit(u,"")))
w = sapply(s,function(x)length(unlist(gregexpr(x,u))))
ifelse(length(s)>0,sum(w["A"]*1,w["B"]* 3,w["C"]*7,na.rm = T),NA)
}
ADD_char=function(x) mapply(funfun,x)
sapply(Interim,ADD_char)
答案 1 :(得分:0)
首先尝试使用此代码之前,请确保检查列的类:sapply(Interim,class)
如果上面的代码为您提供了"factor"
而不是"character"
,那么您必须将数据框更改为在characters
而不是factors
。
只需interim=data.frame(t(t(interim)),stringsAsFactors = F)
或interim=sapply(Interim, as.character)
即可完成工作。然后将ID
列更改为numeric
。即interim[,1]=as.numeric(interim[,1])
。确保数据现在是字符后,您可以运行以下代码:
funfun = function(u){
if(is.numeric(u)) return(u)
s = unique(unlist(strsplit(u,"")))
w = sapply(s,function(x)length(unlist(gregexpr(x,u))))
ifelse(length(s)>0,sum(w["A"]*1,w["B"]* 3,w["C"]*7,na.rm = T),NA)
}
ADD_char=function(x) mapply(funfun,x)
funfun1=function(u){
if(nchar(u)<2) return(u)
a = unlist(strsplit(u,""))
m = length(a)
if(m>0)
sapply(1:m, function(i)paste0(a[i:(ifelse(i+2<m,i+2,m))],collapse = ""))
}
funfun2=function(data){
char_split = function(x) mapply(funfun1,x)
s = lapply(apply(data,1,char_split),unlist)
nam = lapply(s,names)
slen = sapply(nam,length)
ans=`names<-`(do.call(rbind.data.frame,
lapply(s,function(i){length(i)=max(slen);i})),
nam[[which.max(slen)]])
ans=data.frame(t(t(ans)),stringsAsFactors=FALSE)
fn=sapply(ans,function(j) ifelse(is.na(j), "",j))
as.data.frame(fn,stringsAsFactors=FALSE)
}
k=funfun2(interim[,1:4])
mapply(class,k)
k[,1]=as.numeric(k[,1])
sapply(k,ADD_char)
ID L11 L12 L13 L14 L15 L16 L21 L22 L23 L24 L25 L31 L32 L33 L34 L35
[1,] 1 5 7 13 17 14 7 11 9 8 1 NA NA NA NA NA NA
[2,] 2 3 3 3 3 2 1 11 11 11 4 1 15 15 21 14 7
[3,] 3 7 11 11 10 3 11 9 8 1 11 5 11 8 7 NA NA
[4,] 4 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[5,] 5 13 10 4 4 4 3 9 9 2 1 11 7 5 4 1 NA