如何使用tidyr对重复样本的数据求平均,以便绘制带树状图的热图并进行PCA?

时间:2018-04-30 12:33:52

标签: r

我有一个包含超过100个不同样本的数据集。样本来自不同的基因型(例如X、Y、Z)和4个不同的时间点(T0、T1、T2、T3),每个组合有3个生物学重复(R1、R2、R3)。我测量了50个不同基因的值(基因按行排列)。

   structure(list(Gene = structure(1:2, .Label = c("A", "B"), class = "factor"), X_T0_R1 = c(1.46559502, 0.220140568), X_T0_R2 = c(1.087642983, 0.237500819), X_T0_R3 = c(1.424945196, 0.21066267), X_T1_R1 = c(1.289943948, 0.207778662), X_T1_R2 = c(1.376535013, 0.488774258), X_T1_R3 = c(1.833390311, 0.182798731), X_T2_R1 = c(1.450753714, 0.247576125), X_T2_R2 = c(1.3094609, 0.390028842), X_T2_R3 = c(0.5953716, 1.007079177), X_T3_R1 = c(0.7906009, 0.730242116), X_T3_R2 = c(1.215333041, 1.012914813), X_T3_R3 = c(1.069312467, 0.780421013), Y_T0_R1 = c(0.053317766, 3.316414959), Y_T0_R2 = c(0.506623748, 3.599442788), Y_T0_R3 = c(0.713670106, 2.516735845), Y_T1_R1 = c(0.740998252, 1.444496448), Y_T1_R2 = c(0.648231834, 0.097957459), Y_T1_R3 = c(0.780499252, 0.187840968), Y_T2_R1 = c(0.35344654, 1.190274584), Y_T2_R2 = c(0.220223951, 1.367784148), Y_T2_R3 = c(0.432856978, 1.403057729), Y_T3_R1 = c(0.234963735, 1.232129062), Y_T3_R2 = c(0.353770497, 0.885122768), Y_T3_R3 = c(0.396091395, 1.333921747), Z_T0_R1 = c(0.398000559, 1.286528398), Z_T0_R2 = c(0.384759325, 1.122251177), Z_T0_R3 = c(1.582230097, 0.697419716), Z_T1_R1 = c(1.136843842, 0.804552001), Z_T1_R2 = c(1.275683837, 1.227821594), Z_T1_R3 = c(0.963349308, 0.968589683), Z_T2_R1 = c(3.765036263, 0.477443352), Z_T2_R2 = c(1.901023385, 0.832736132), Z_T2_R3 = c(1.407713024, 0.911920317), Z_T3_R1 = c(0.988333629, 1.095130142), Z_T3_R2 = c(0.618606729, 0.497458337), Z_T3_R3 = c(0.429823986, 0.471389536)), .Names = c("Gene", "X_T0_R1", "X_T0_R2", "X_T0_R3", "X_T1_R1", "X_T1_R2", "X_T1_R3", "X_T2_R1", "X_T2_R2", "X_T2_R3", "X_T3_R1", "X_T3_R2", "X_T3_R3", "Y_T0_R1", "Y_T0_R2", "Y_T0_R3", "Y_T1_R1", "Y_T1_R2", "Y_T1_R3", "Y_T2_R1", "Y_T2_R2", "Y_T2_R3", "Y_T3_R1", "Y_T3_R2", "Y_T3_R3", "Z_T0_R1", "Z_T0_R2", "Z_T0_R3", "Z_T1_R1", "Z_T1_R2", "Z_T1_R3", "Z_T2_R1", "Z_T2_R2", "Z_T2_R3", "Z_T3_R1", "Z_T3_R2", "Z_T3_R3"), class = "data.frame", row.names = c(NA, -2L))

我希望对每个时间点(T0、T1、T2、T3)的3个重复(R1、R2、R3)求平均并生成一个新矩阵,然后绘制带树状图的热图。

如何对每个基因型在各个时间点的3个重复求平均并生成新矩阵?用tidyr可以做到吗?

2 个答案:

答案 0 :(得分:0)

我不完全确定你想要什么。不过对于数据整理(data wrangling),dplyr + tidyr做得很好。

library(dplyr)
library(tidyr)

# Reshape the wide table (one column per genotype_time_replicate sample) to
# long form, splitting each column name on "_" into its three components.
# pivot_longer()/pivot_wider() replace the superseded gather()/spread().
df1 <- df %>%
  pivot_longer(
    cols = -Gene,
    names_to = c("genotypes", "time", "replicates"),
    names_sep = "_",
    values_to = "value"
  ) %>%
  # Mean over every gene and genotype within each replicate/time cell.
  # NOTE(review): to average the three replicates per gene and genotype
  # instead, group by Gene, genotypes and time here.
  group_by(replicates, time) %>%
  summarise(value = mean(value), .groups = "drop") %>%
  # One row per time point, one column per replicate (R1, R2, R3).
  pivot_wider(names_from = replicates, values_from = value)
df1

# A tibble: 4 x 4
#   time     R1    R2    R3
#   <chr> <dbl> <dbl> <dbl>
# 1 T0    1.12  1.16  1.19 
# 2 T1    0.937 0.853 0.819
# 3 T2    1.25  1.00  0.960
# 4 T3    0.845 0.764 0.747

热图可以通过以下方式轻松完成:

# Every column except the first (the time labels) becomes the matrix body;
# the time labels become the row names used on the heatmap axis.
df2 <- as.matrix(df1[, -1])
dimnames(df2) <- list(df1$time, colnames(df2))
heatmap(df2)

修改

以下是df

# Example input data: a data.frame with two rows (genes "A" and "B", stored in
# the factor column Gene) and 36 numeric columns. Each numeric column is named
# <genotype>_<time>_<replicate>, with genotypes X/Y/Z, time points T0-T3 and
# replicates R1-R3.
df <- structure(list(Gene = structure(1:2, .Label = c("A", "B"), class = "factor"), X_T0_R1 = c(1.46559502, 0.220140568), X_T0_R2 = c(1.087642983, 0.237500819), X_T0_R3 = c(1.424945196, 0.21066267), X_T1_R1 = c(1.289943948, 0.207778662), X_T1_R2 = c(1.376535013, 0.488774258), X_T1_R3 = c(1.833390311, 0.182798731), X_T2_R1 = c(1.450753714, 0.247576125), X_T2_R2 = c(1.3094609, 0.390028842), X_T2_R3 = c(0.5953716, 1.007079177), X_T3_R1 = c(0.7906009, 0.730242116), X_T3_R2 = c(1.215333041, 1.012914813), X_T3_R3 = c(1.069312467, 0.780421013), Y_T0_R1 = c(0.053317766, 3.316414959), Y_T0_R2 = c(0.506623748, 3.599442788), Y_T0_R3 = c(0.713670106, 2.516735845), Y_T1_R1 = c(0.740998252, 1.444496448), Y_T1_R2 = c(0.648231834, 0.097957459), Y_T1_R3 = c(0.780499252, 0.187840968), Y_T2_R1 = c(0.35344654, 1.190274584), Y_T2_R2 = c(0.220223951, 1.367784148), Y_T2_R3 = c(0.432856978, 1.403057729), Y_T3_R1 = c(0.234963735, 1.232129062), Y_T3_R2 = c(0.353770497, 0.885122768), Y_T3_R3 = c(0.396091395, 1.333921747), Z_T0_R1 = c(0.398000559, 1.286528398), Z_T0_R2 = c(0.384759325, 1.122251177), Z_T0_R3 = c(1.582230097, 0.697419716), Z_T1_R1 = c(1.136843842, 0.804552001), Z_T1_R2 = c(1.275683837, 1.227821594), Z_T1_R3 = c(0.963349308, 0.968589683), Z_T2_R1 = c(3.765036263, 0.477443352), Z_T2_R2 = c(1.901023385, 0.832736132), Z_T2_R3 = c(1.407713024, 0.911920317), Z_T3_R1 = c(0.988333629, 1.095130142), Z_T3_R2 = c(0.618606729, 0.497458337), Z_T3_R3 = c(0.429823986, 0.471389536)), .Names = c("Gene", "X_T0_R1", "X_T0_R2", "X_T0_R3", "X_T1_R1", "X_T1_R2", "X_T1_R3", "X_T2_R1", "X_T2_R2", "X_T2_R3", "X_T3_R1", "X_T3_R2", "X_T3_R3", "Y_T0_R1", "Y_T0_R2", "Y_T0_R3", "Y_T1_R1", "Y_T1_R2", "Y_T1_R3", "Y_T2_R1", "Y_T2_R2", "Y_T2_R3", "Y_T3_R1", "Y_T3_R2", "Y_T3_R3", "Z_T0_R1", "Z_T0_R2", "Z_T0_R3", "Z_T1_R1", "Z_T1_R2", "Z_T1_R3", "Z_T2_R1", "Z_T2_R2", "Z_T2_R3", "Z_T3_R1", "Z_T3_R2", "Z_T3_R3"), class = "data.frame", row.names = c(NA, -2L))

enter image description here

答案 1 :(得分:0)

我会将数据转换为长格式,提取基因型和时间点信息,然后根据它进行平均。

library(data.table)

# Transform to long format: one row per (Gene, sample column) pair.
# Convert to a data.table first so that data.table's own melt() method is
# dispatched (calling melt() on a plain data.frame falls back to the
# deprecated reshape2 package), and name the id column explicitly instead of
# relying on it being guessed.
dataMod <- melt(as.data.table(dataOrig), id.vars = "Gene")
# head(dataMod)
#   Gene variable     value
# 1:    A  X_T0_R1 1.4655950
# 2:    B  X_T0_R1 0.2201406
# 3:    A  X_T0_R2 1.0876430
# 4:    B  X_T0_R2 0.2375008
# 5:    A  X_T0_R3 1.4249452
# 6:    B  X_T0_R3 0.2106627

# Derive the sample annotations from the melted column name in a single
# by-reference update: text before the first "_" is the genotype, text after
# the last "_" is the replicate, and the piece between the last two "_" is
# the time point.
dataMod[, `:=`(
  genotype = sub("_.*", "", variable),
  replicate = sub(".*_", "", variable),
  timePoint = sub(".*_(.*)_.*", "\\1", variable)
)]
# head(dataMod)
#   Gene variable     value genotype replicate timePoint
# 1:    A  X_T0_R1 1.4655950        X        R1        T0
# 2:    B  X_T0_R1 0.2201406        X        R1        T0
# 3:    A  X_T0_R2 1.0876430        X        R2        T0
# 4:    B  X_T0_R2 0.2375008        X        R2        T0
# 5:    A  X_T0_R3 1.4249452        X        R3        T0
# 6:    B  X_T0_R3 0.2106627        X        R3        T0

# Mean of the replicate values for every gene / genotype / time point
# combination. The result column is named V1, which the later dcast() calls
# refer to through value.var.
dataAve <- dataMod[, .(V1 = mean(value)), by = .(Gene, genotype, timePoint)]
# head(dataAve)
#    Gene genotype timePoint        V1
# 1:    A        X        T0 1.3260611
# 2:    B        X        T0 0.2227680
# 3:    A        X        T1 1.4999564
# 4:    B        X        T1 0.2931172
# 5:    A        X        T2 1.1185287
# 6:    B        X        T2 0.5482280

或者您可以通过以下方式恢复为原始的宽格式:

# Back to wide format: one row per gene, one column per genotype_timePoint.
dcast(
  data = dataAve,
  formula = Gene ~ genotype + timePoint,
  value.var = "V1"
)
#    Gene     X_T0      X_T1     X_T2      X_T3
# 1:    A 1.326061 1.4999564 1.118529 1.0250821
# 2:    B 0.222768 0.2931172 0.548228 0.8411926

使用这样的数据(宽格式),您可以执行pca或其他分析:

# Wide table of replicate means: one row per gene, one column per
# genotype_timePoint sample.
foo <- dcast(
  data = dataAve,
  formula = Gene ~ genotype + timePoint,
  value.var = "V1"
)
# Drop the first column (the gene labels), then transpose so that each
# genotype_timePoint sample is a row and each gene is a column for prcomp().
bar <- prcomp(t(as.matrix(foo[, -1L, with = FALSE])))