假设我有一个这样的数据框:
C1 C2 C3 C4
1 rs1009434 172.965 168635004
1 rs1009941 15.446 9275194
1 rs10127622 . 151468060
1 rs10157045 25.4653 14575084
1 rs10158159 71.1698 48006898
1 rs1020863 . 61129957
1 rs1024268 172.965 70117404
1 rs1025653 224.778 217081526
1 rs1028180 174.272 169345868
1 rs10437097 70.7848 47856611
1 rs10493980 172.965 102540084
1 rs10495107 . 218970504
1 rs10495306 238.74 231383173
我想把C4中与C3里的点(“.”)或重复值相对应的数字变为负数。为此,我的做法是:
# t1: rows whose C3 is "." or repeats an earlier C3 value (the rows to negate)
t1=subset(df,C3=='.'|duplicated(C3))
# t2: every remaining row (the logical complement of the condition above)
t2=subset(df,!(C3=='.'|duplicated(C3)))
然后,
# Flip the sign of C4 for the selected rows
t1$C4=t1$C4*(-1)
# Stack the two pieces back together; rbind() places all t1 rows before the t2 rows
df.new=rbind(t1,t2)
所以现在我得到了
C1 C2 C3 C4
1 rs1009434 172.965 168635004
1 rs1009941 15.446 9275194
1 rs10127622 . -151468060
1 rs10157045 25.4653 14575084
1 rs10158159 71.1698 48006898
1 rs1020863 . -61129957
1 rs1024268 172.965 -70117404
1 rs1025653 224.778 217081526
1 rs1028180 174.272 169345868
1 rs10437097 70.7848 47856611
1 rs10493980 172.965 -102540084
1 rs10495107 . -218970504
1 rs10495306 238.74 231383173
我的问题是,有没有其他简单的方法可以做到这一点?
答案 0 :(得分:1)
# Rows to negate: C3 is a lone "." (leading/trailing whitespace tolerated)
# or repeats an earlier C3 value.
indx <- with(df, grepl("^(\\s+)?\\.(\\s+)?$", C3) | duplicated(C3))
#or
library(stringr)
df$C3 <- str_trim(df$C3)# just in case you have trailing/leading spaces
indx <- with(df, C3=="."|duplicated(C3))
#For example
# grepl("^(\\s+)?\\.(\\s+)?$", c(" .", "." , " .", "3.5453 "))
#[1] TRUE TRUE TRUE FALSE
# str_trim(c(" .", "." , " .", "3.5453 "))=="."
#[1] TRUE TRUE TRUE FALSE
# Flip the sign of C4 in place for the flagged rows; row order is unchanged.
df$C4[indx] <- -1*df$C4[indx]
df
# C1 C2 C3 C4
#1 1 rs1009434 172.965 168635004
#2 1 rs1009941 15.446 9275194
#3 1 rs10127622 . -151468060
#4 1 rs10157045 25.4653 14575084
#5 1 rs10158159 71.1698 48006898
#6 1 rs1020863 . -61129957
#7 1 rs1024268 172.965 -70117404
#8 1 rs1025653 224.778 217081526
#9 1 rs1028180 174.272 169345868
#10 1 rs10437097 70.7848 47856611
#11 1 rs10493980 172.965 -102540084
#12 1 rs10495107 . -218970504
#13 1 rs10495306 238.74 231383173
答案 1 :(得分:1)
# Reproducible copy of the example data (dput() output, console "+"
# continuation prompts removed so it can be pasted and run as-is).
# V2 and V3 are factors; V1 and V4 are integer columns.
tt <- structure(list(V1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), V2 = structure(1:13, .Label = c("rs1009434", "rs1009941",
"rs10127622", "rs10157045", "rs10158159", "rs1020863", "rs1024268",
"rs1025653", "rs1028180", "rs10437097", "rs10493980", "rs10495107",
"rs10495306"), class = "factor"), V3 = structure(c(3L, 2L, 1L,
7L, 9L, 1L, 3L, 5L, 4L, 8L, 3L, 1L, 6L), .Label = c(".", "15.446",
"172.965", "174.272", "224.778", "238.74", "25.4653", "70.7848",
"71.1698"), class = "factor"), V4 = c(168635004L, 9275194L, 151468060L,
14575084L, 48006898L, 61129957L, 70117404L, 217081526L, 169345868L,
47856611L, 102540084L, 218970504L, 231383173L)), .Names = c("V1",
"V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -13L
))
> tt
V1 V2 V3 V4
1 1 rs1009434 172.965 168635004
2 1 rs1009941 15.446 9275194
3 1 rs10127622 . 151468060
4 1 rs10157045 25.4653 14575084
5 1 rs10158159 71.1698 48006898
6 1 rs1020863 . 61129957
7 1 rs1024268 172.965 70117404
8 1 rs1025653 224.778 217081526
9 1 rs1028180 174.272 169345868
10 1 rs10437097 70.7848 47856611
11 1 rs10493980 172.965 102540084
12 1 rs10495107 . 218970504
13 1 rs10495306 238.74 231383173
# Row positions where V3 repeats an earlier value or equals "."
> idx = which(duplicated(tt$V3) | tt$V3=='.')
# Negate column 4 (V4) in place for exactly those rows
> tt[idx,4] = tt[idx,4]*(-1)
> tt
V1 V2 V3 V4
1 1 rs1009434 172.965 168635004
2 1 rs1009941 15.446 9275194
3 1 rs10127622 . -151468060
4 1 rs10157045 25.4653 14575084
5 1 rs10158159 71.1698 48006898
6 1 rs1020863 . -61129957
7 1 rs1024268 172.965 -70117404
8 1 rs1025653 224.778 217081526
9 1 rs1028180 174.272 169345868
10 1 rs10437097 70.7848 47856611
11 1 rs10493980 172.965 -102540084
12 1 rs10495107 . -218970504
13 1 rs10495306 238.74 231383173
>
答案 2 :(得分:0)
import re
# Negate the trailing number on rows whose third field is a lone ".".
# Group 1 captures everything up to and including the "." and the
# whitespace after it; group 2 captures the digits, which must be
# re-emitted (\2) after the inserted minus sign or the number is lost.
# NOTE(review): x is assumed to hold the table as text -- it is not
# defined in this snippet. Rows with duplicated values are not handled.
print(re.sub(r"(?=\d\s+\S+\s+\.\s+\d+)(.*?\.\s+)(\d+)", r"\1-\2", x))
在Python中,可以使用正则表达式替换(re.sub)来完成。