R - 子串到三元组并执行计算

时间:2017-08-03 17:37:40

标签: r

我在R - generate dynamic number of columns and substring column values问了一个类似的问题,但我的问题中的细节现在已经改变了,所以我要重新发布,因为我需要一个不同的解决方案。

我附上了一张图片,说明了我的起始数据集和我想要达到的终点。我需要一个只使用base R的解决方案,因为我将使用的平台无法使用其他软件包。

原始数据集有多列。对于一些列,即L1,L2,L3,我想;

1)根据列中任何字符串的最大长度生成动态数量的列,例如L1最大长度= 6,因此6个新列各自标记为“L1_1”到“L1_6”

2)将原始字符串分隔为子字符串,每个字符串从左侧开始包含3个字符。倒数第二列将包含2个字符,最后一列将包含1个字符。 (与原始问题不同)

3)对这些子串执行计算,即('a'的数量 * 1)+('b'的数量 * 3)+('c'的数量 * 7),并将该计算的值返回到新列中。

有人对如何做到这一点有任何想法吗?

提前致谢。

dput(original_data):
    structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("",                                                                          "AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,                                                                        
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"),                                                                           L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC",                                                                                     "CACCC", "CBABA"), class = "factor")), .Names = c("ID", "L1",                                                                                      
"L2", "L3"), class = "data.frame", row.names = c(NA, -5L))   

dput(interim_data):
structure(list(ID = 1:5, L1 = structure(c(3L, 2L, 4L, 1L, 5L), .Label = c("",                                                                          
"AAAAAA", "AABBCC", "BBACB", "BCBDAB"), class = "factor"), L2 = structure(c(3L,                                                                        
4L, 3L, 1L, 2L), .Label = c("", "ACAA", "BACA", "BACBA"), class = "factor"),                                                                           
    L3 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "CABAC",                                                                                      
    "CACCC", "CBABA"), class = "factor"), L1_1 = structure(c(3L,                                                                                       
    2L, 4L, 1L, 5L), .Label = c("", "AAA", "AAB", "BBA", "BCB"                                                                                         
    ), class = "factor"), L1_2 = structure(c(3L, 2L, 4L, 1L,                                                                                           
    5L), .Label = c("", "AAA", "ABB", "BAC", "CBD"), class = "factor"),                                                                                
    L1_3 = structure(c(4L, 2L, 3L, 1L, 5L), .Label = c("", "AAA",                                                                                      
    "ACB", "BBC", "BDA"), class = "factor"), L1_4 = structure(c(3L,                                                                                    
    2L, 4L, 1L, 5L), .Label = c("", "AAA", "BCC", "CB", "DAB"                                                                                          
    ), class = "factor"), L1_5 = structure(c(5L, 2L, 4L, 1L,                                                                                           
    3L), .Label = c("", "AA", "AB", "B", "CC"), class = "factor"),                                                                                     
    L1_6 = structure(c(4L, 2L, 1L, 1L, 3L), .Label = c("", "A",                                                                                        
    "B", "C"), class = "factor"), L2_1 = structure(c(3L, 3L,                                                                                           
    3L, 1L, 2L), .Label = c("", "ACA", "BAC"), class = "factor"),                                                                                      
    L2_2 = structure(c(2L, 3L, 2L, 1L, 4L), .Label = c("", "ACA",                                                                                      
    "ACB", "CAA"), class = "factor"), L2_3 = structure(c(3L,                                                                                           
    4L, 3L, 1L, 2L), .Label = c("", "AA", "AC", "CBA"), class = "factor"),                                                                             
    L2_4 = structure(c(2L, 3L, 2L, 1L, 2L), .Label = c("", "A",                                                                                        
    "BA"), class = "factor"), L2_5 = structure(c(1L, 2L, 1L,                                                                                           
    1L, 1L), .Label = c("", "A"), class = "factor"), L3_1 = structure(c(1L,                                                                            
    3L, 2L, 1L, 4L), .Label = c("", "CAB", "CAC", "CBA"), class = "factor"),                                                                           
    L3_2 = structure(c(1L, 3L, 2L, 1L, 4L), .Label = c("", "ABA",                                                                                      
    "ACC", "BAB"), class = "factor"), L3_3 = structure(c(1L,                                                                                           
    4L, 3L, 1L, 2L), .Label = c("", "ABA", "BAC", "CCC"), class = "factor"),                                                                           
    L3_4 = structure(c(1L, 4L, 2L, 1L, 3L), .Label = c("", "AC",                                                                                       
    "BA", "CC"), class = "factor"), L3_5 = structure(c(1L, 3L,                                                                                         
    3L, 1L, 2L), .Label = c("", "A", "C"), class = "factor")), .Names = c("ID",                                                                        
"L1", "L2", "L3", "L1_1", "L1_2", "L1_3", "L1_4", "L1_5", "L1_6",                                                                                      
"L2_1", "L2_2", "L2_3", "L2_4", "L2_5", "L3_1", "L3_2", "L3_3",                                                                                        
"L3_4", "L3_5"), class = "data.frame", row.names = c(NA, -5L))  

Example

编辑: 代码由@Onyambu提供;

# Convert every column to character while keeping a data.frame. Going through
# sapply() would collapse the data to a character matrix, which silently turns
# the numeric ID assigned below back into text.
interim <- df
interim[] <- lapply(interim, as.character)
# Restore the first column as a number.
interim[, 1] <- as.numeric(interim[, 1])
# Score one string by its letters: each occurrence of a character named in
# `weights` adds that character's weight (by default A = 1, B = 3, C = 7); any
# other character contributes nothing.
#
# Args:
#   u: a single character string, or a numeric value.
#   weights: named numeric vector mapping single characters to their weight.
#
# Returns:
#   `u` itself when it is numeric, NA when the string is empty, otherwise the
#   weighted count as a number.
funfun <- function(u, weights = c(A = 1, B = 3, C = 7)) {
  if (is.numeric(u)) {
    return(u)
  }
  chars <- unlist(strsplit(u, ""))
  if (length(chars) == 0L) {
    return(NA)
  }
  # Tabulate literal characters instead of using each one as a regular
  # expression, so input such as "(" or "\\" cannot break the count.
  counts <- table(factor(chars, levels = names(weights)))
  sum(as.vector(counts) * weights)
}

# Element-wise wrapper: runs funfun on each entry of x and lets mapply()
# simplify the collected results.
ADD_char <- function(x) {
  mapply(FUN = funfun, x)
}
# Apply ADD_char across `interim` and print the result.
# NOTE(review): assumes `interim` is a data.frame so that sapply() walks it
# column by column; on a character matrix it would walk cell by cell - confirm.
sapply(interim,ADD_char)
# Keep the first four columns as they are and append the ADD_char result
# computed on all remaining columns.
dat1 <- cbind(interim[,1:4],sapply(interim[,-(1:4)],ADD_char))

解析错误的结果

2 个答案:

答案 0 :(得分:0)

在尝试使用此代码之前,请先检查各列的类:sapply(Interim,class)。如果上面的代码给出的是"factor"而不是"character",则必须先把数据框中的列从factor转换为character。只需使用interim=data.frame(t(t(interim)),stringsAsFactors = F)或interim=sapply(Interim, as.character)即可完成。然后将ID列更改为numeric,即interim[,1]=as.numeric(interim[,1])。确认数据现在是字符类型后,您可以运行以下代码:

# Score one string by its letters: each occurrence of a character named in
# `weights` adds that character's weight (by default A = 1, B = 3, C = 7); any
# other character contributes nothing.
#
# Args:
#   u: a single character string, or a numeric value.
#   weights: named numeric vector mapping single characters to their weight.
#
# Returns:
#   `u` itself when it is numeric, NA when the string is empty, otherwise the
#   weighted count as a number.
funfun <- function(u, weights = c(A = 1, B = 3, C = 7)) {
  if (is.numeric(u)) {
    return(u)
  }
  chars <- unlist(strsplit(u, ""))
  if (length(chars) == 0L) {
    return(NA)
  }
  # Tabulate literal characters instead of using each one as a regular
  # expression, so input such as "(" or "\\" cannot break the count.
  counts <- table(factor(chars, levels = names(weights)))
  sum(as.vector(counts) * weights)
}

 # Element-wise wrapper: runs funfun on each entry of x and lets mapply()
 # simplify the collected results.
 ADD_char <- function(x) {
   mapply(FUN = funfun, x)
 }

 # Score every column of the converted (character) data. R names are
 # case-sensitive, so this must be `interim`, the object produced by the
 # conversion step, not the unconverted `Interim`.
 sapply(interim, ADD_char)

答案 1 :(得分:0)

在尝试使用此代码之前,请先检查各列的类:sapply(Interim,class)。如果上面的代码给出的是"factor"而不是"character",则必须先把数据框中的列从factor转换为character。只需使用interim=data.frame(t(t(interim)),stringsAsFactors = F)或interim=sapply(Interim, as.character)即可完成。然后将ID列更改为numeric,即interim[,1]=as.numeric(interim[,1])。确认数据现在是字符类型后,您可以运行以下代码:

# Score one string by its letters: each occurrence of a character named in
# `weights` adds that character's weight (by default A = 1, B = 3, C = 7); any
# other character contributes nothing.
#
# Args:
#   u: a single character string, or a numeric value.
#   weights: named numeric vector mapping single characters to their weight.
#
# Returns:
#   `u` itself when it is numeric, NA when the string is empty, otherwise the
#   weighted count as a number.
funfun <- function(u, weights = c(A = 1, B = 3, C = 7)) {
  if (is.numeric(u)) {
    return(u)
  }
  chars <- unlist(strsplit(u, ""))
  if (length(chars) == 0L) {
    return(NA)
  }
  # Tabulate literal characters instead of using each one as a regular
  # expression, so input such as "(" or "\\" cannot break the count.
  counts <- table(factor(chars, levels = names(weights)))
  sum(as.vector(counts) * weights)
}

 # Element-wise wrapper: runs funfun on each entry of x and lets mapply()
 # simplify the collected results.
 ADD_char <- function(x) {
   mapply(FUN = funfun, x)
 }


 # Break a single string into overlapping windows of up to three characters,
 # one window starting at every character position; the windows at the end
 # shrink to two and then one character. Input shorter than two characters is
 # returned unchanged.
 funfun1 <- function(u) {
   if (nchar(u) < 2) {
     return(u)
   }
   chars <- unlist(strsplit(u, ""))
   n_chars <- length(chars)
   if (n_chars > 0) {
     vapply(
       seq_len(n_chars),
       function(start) {
         # Clamp the window end to the last character.
         paste0(chars[start:min(start + 2, n_chars)], collapse = "")
       },
       character(1)
     )
   }
 }

 # Expand every text column of `data` into its funfun1 windows, one output
 # column per window position.
 #
 # Each input column is handled on its own, so a short or empty string only
 # leaves "" in the trailing positions of its own column group and never
 # shifts the values belonging to the columns on its right. Output columns are
 # named <column><position> (L11, L12, ...) when a column yields more than one
 # position and keep the original name otherwise. Numeric columns, such as an
 # ID, are carried through unsplit.
 #
 # Args:
 #   data: data.frame (or matrix) with one row per record.
 #
 # Returns:
 #   A data.frame of character columns with as many rows as `data`; missing
 #   values and unused positions are "".
 funfun2 <- function(data) {
   data <- as.data.frame(data, stringsAsFactors = FALSE)

   # Turn one input column into a character matrix: one row per record, one
   # column per window position.
   split_column <- function(values, col_name) {
     keep_whole <- is.numeric(values)
     values <- as.character(values)
     values[is.na(values)] <- ""
     pieces <- if (keep_whole) as.list(values) else lapply(values, funfun1)
     width <- max(1L, lengths(pieces))
     padded <- vapply(
       pieces,
       function(p) c(p, rep("", width - length(p))),
       character(width)
     )
     # vapply() stores each record's pieces in one column, so refill by row.
     out <- matrix(padded, ncol = width, byrow = TRUE)
     colnames(out) <- if (width > 1L) {
       paste0(col_name, seq_len(width))
     } else {
       col_name
     }
     out
   }

   blocks <- Map(split_column, data, names(data))
   as.data.frame(do.call(cbind, unname(blocks)), stringsAsFactors = FALSE)
 }

# Build the window columns from the first four columns of `interim`.
k <- funfun2(interim[, 1:4])
# Show the class of every generated column.
mapply(class, k)
# Turn the first column back into a number.
k[[1]] <- as.numeric(k[[1]])
# Score every column and print the result.
sapply(k, ADD_char)

     ID L11 L12 L13 L14 L15 L16 L21 L22 L23 L24 L25 L31 L32 L33 L34 L35
[1,]  1   5   7  13  17  14   7  11   9   8   1  NA  NA  NA  NA  NA  NA
[2,]  2   3   3   3   3   2   1  11  11  11   4   1  15  15  21  14   7
[3,]  3   7  11  11  10   3  11   9   8   1  11   5  11   8   7  NA  NA
[4,]  4  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
[5,]  5  13  10   4   4   4   3   9   9   2   1  11   7   5   4   1  NA