使用表达式在数据框中处理列的子集

时间:2014-08-14 07:13:28

标签: regex r replace subset

假设我有一个这样的数据框:

C1     C2           C3        C4
1      rs1009434    172.965   168635004
1      rs1009941    15.446    9275194
1      rs10127622   .         151468060
1      rs10157045   25.4653   14575084
1      rs10158159   71.1698   48006898
1      rs1020863    .         61129957
1      rs1024268    172.965   70117404
1      rs1025653    224.778   217081526
1      rs1028180    174.272   169345868
1      rs10437097   70.7848   47856611
1      rs10493980   172.965   102540084
1      rs10495107   .         218970504
1      rs10495306   238.74    231383173

我想把C3为点(.)或为重复值的那些行所对应的C4数值变成负数。为此,我的做法是:

# t1: rows of df whose C3 is '.' or repeats a C3 value seen in an earlier row
t1=subset(df,C3=='.'|duplicated(C3))
# t2: rows selected by the negated condition (C3 is neither '.' nor a repeat)
t2=subset(df,!(C3=='.'|duplicated(C3)))

然后,

# Flip the sign of C4 for the rows collected in t1
t1$C4=t1$C4*(-1)

# Stack t1 on top of t2; all t1 rows come first, so the row order of df is not kept
df.new=rbind(t1,t2)

所以现在我得到了

    C1     C2           C3        C4
1      rs1009434    172.965   168635004
1      rs1009941    15.446    9275194
1      rs10127622   .         -151468060
1      rs10157045   25.4653   14575084
1      rs10158159   71.1698   48006898
1      rs1020863    .         -61129957
1      rs1024268    172.965   -70117404
1      rs1025653    224.778   217081526
1      rs1028180    174.272   169345868
1      rs10437097   70.7848   47856611
1      rs10493980   172.965   -102540084
1      rs10495107   .         -218970504
1      rs10495306   238.74    231383173

我的问题是,有没有其他简单的方法可以做到这一点?

3 个答案:

答案 0 :(得分:1)

# Flag the rows whose C4 must be negated: C3 is a lone "." (optionally padded
# with whitespace) or C3 repeats a value seen in an earlier row.
# The pattern must start at the "^" anchor; a stray quote character in front of
# it would make the pattern unmatchable and no "." row would ever be flagged.
indx <- with(df, grepl("^(\\s+)?\\.(\\s+)?$", C3) | duplicated(C3))

# or
library(stringr)
df$C3 <- str_trim(df$C3) # just in case you have trailing/leading spaces
indx <- with(df, C3 == "." | duplicated(C3))
# For example

# grepl("^(\\s+)?\\.(\\s+)?$", c(" .", "." , " .", "3.5453 "))
#[1]  TRUE  TRUE  TRUE FALSE

# str_trim(c(" .", "." , " .", "3.5453 "))=="."
#[1]  TRUE  TRUE  TRUE FALSE



# Negate C4 in place on the flagged rows; row order is left untouched.
df$C4[indx] <- -1 * df$C4[indx]

df
#   C1         C2      C3         C4
#1   1  rs1009434 172.965  168635004
#2   1  rs1009941  15.446    9275194
#3   1 rs10127622       . -151468060
#4   1 rs10157045 25.4653   14575084
#5   1 rs10158159 71.1698   48006898
#6   1  rs1020863       .  -61129957
#7   1  rs1024268 172.965  -70117404
#8   1  rs1025653 224.778  217081526
#9   1  rs1028180 174.272  169345868
#10  1 rs10437097 70.7848   47856611
#11  1 rs10493980 172.965 -102540084
#12  1 rs10495107       . -218970504
#13  1 rs10495306  238.74  231383173

答案 1 :(得分:1)

tt = structure(list(V1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
+ 1L, 1L, 1L), V2 = structure(1:13, .Label = c("rs1009434", "rs1009941", 
+ "rs10127622", "rs10157045", "rs10158159", "rs1020863", "rs1024268", 
+ "rs1025653", "rs1028180", "rs10437097", "rs10493980", "rs10495107", 
+ "rs10495306"), class = "factor"), V3 = structure(c(3L, 2L, 1L, 
+ 7L, 9L, 1L, 3L, 5L, 4L, 8L, 3L, 1L, 6L), .Label = c(".", "15.446", 
+ "172.965", "174.272", "224.778", "238.74", "25.4653", "70.7848", 
+ "71.1698"), class = "factor"), V4 = c(168635004L, 9275194L, 151468060L, 
+ 14575084L, 48006898L, 61129957L, 70117404L, 217081526L, 169345868L, 
+ 47856611L, 102540084L, 218970504L, 231383173L)), .Names = c("V1", 
+ "V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -13L
+ ))
> tt
   V1         V2      V3        V4
1   1  rs1009434 172.965 168635004
2   1  rs1009941  15.446   9275194
3   1 rs10127622       . 151468060
4   1 rs10157045 25.4653  14575084
5   1 rs10158159 71.1698  48006898
6   1  rs1020863       .  61129957
7   1  rs1024268 172.965  70117404
8   1  rs1025653 224.778 217081526
9   1  rs1028180 174.272 169345868
10  1 rs10437097 70.7848  47856611
11  1 rs10493980 172.965 102540084
12  1 rs10495107       . 218970504
13  1 rs10495306  238.74 231383173
> idx = which(duplicated(tt$V3) | tt$V3=='.')
> tt[idx,4] = tt[idx,4]*(-1)
> tt
   V1         V2      V3         V4
1   1  rs1009434 172.965  168635004
2   1  rs1009941  15.446    9275194
3   1 rs10127622       . -151468060
4   1 rs10157045 25.4653   14575084
5   1 rs10158159 71.1698   48006898
6   1  rs1020863       .  -61129957
7   1  rs1024268 172.965  -70117404
8   1  rs1025653 224.778  217081526
9   1  rs1028180 174.272  169345868
10  1 rs10437097 70.7848   47856611
11  1 rs10493980 172.965 -102540084
12  1 rs10495107       . -218970504
13  1 rs10495306  238.74  231383173
> 

答案 2 :(得分:0)

import re

# Prefix the 4th-column number with "-" on rows whose 3rd column is a lone ".".
# The lookahead pins the match to the start of such a row; group 1 captures
# everything up to the number and group 2 the number itself, so the replacement
# has to emit both ("\1-\2") -- emitting only "\1-" would delete the digits.
# NOTE: this covers only the "." rows, not rows with a repeated 3rd-column value.
# NOTE(review): `x` is presumably the table text as a string; it is not defined here.
print(re.sub(r"(?=\d\s+\S+\s+\.\s+\d+)(.*?\.\s+)(\d+)", r"\1-\2", x))

在Python中,可以使用正则表达式替换(re.sub)来完成。