Question

我的数据是一种“偏长”（long-ish）的格式：大多数变量都有自己的列，但最后两列（point_type和point_value）是长格式的。像这样：

strain light hours point_type point_value
------ ----- ----- ---------- -----------
blank  25    17    log_od750  -0.9018948
blank  25    17    log_od750  -2.0778428
blank  25    17    log_od750  -1.6129440
blank  25    17    log_od750  -2.0076792
blank  25    17    log_od750  -2.8913722
blank  25    17    log_od750  -2.4292827

如何只针对几种特定的点类型提取出宽格式的数据框？例如，我想要strain、light、hours、od750和log_od750这几列，并丢弃其他所有的point_type。我认为最简单的方法是先用类似data <- subset(data, point_type %in% c('od750', 'log_od750'))的语句取子集，然后再重塑？

我尝试了各种dcast公式，但始终没有得到想要的结果。我是否需要先melt，再完成剩下的工作？如果是这样，我该如何防止重复的测量值被聚合？

我知道已经有很多关于重塑的问题和答案！但我花了整个上午也没能从中弄明白，甚至不确定哪些与我的问题有关。

编辑：更好的样本数据。

# Reproducible toy data: 100 rows with randomly drawn strain labels and point
# types; light, hours and point_value are each a random permutation of 1..100.
# The draws happen in column order, so the seed fully determines the result.
set.seed(1)
data <- data.frame(
  strain = sample(paste0("strain", seq_len(10)), size = 100, replace = TRUE),
  light = sample(100),
  hours = sample(100),
  point_type = sample(
    c("od750", "log_od750", "loess", "loess_err", "locfit", "locfit_err"),
    size = 100,
    replace = TRUE
  ),
  point_value = sample(100)
)

（我要重新整形的列将具有匹配的hours值，我只是不确定如何编写一个sample调用来证明这一点。）

编辑：eipi10的答案几乎可以解决，但我认为我还需要通过添加ID列来解决无法在dcast中保留重复值的问题，如其他几个问题所示：

但在阅读+尝试这些例子后，我仍然不确定如何构建该列。

编辑：基于eipi10的答案和this one得到的可行解决方案：

cast_points <- function(df, ptype_regex) {
  # Extract a wide-format data frame from "long-ish" data
  # (identifying columns + point_type, point_value).
  #
  # Args:
  #   df:          data frame with point_type and point_value columns; every
  #                other column is treated as an identifying column
  #   ptype_regex: regular expression; only rows whose point_type matches it
  #                are kept, and each matching point_type becomes a column
  #
  # Returns:
  #   a data frame holding the identifying columns plus one column per
  #   matching point_type. When no point_type matches, a zero-row data frame
  #   with just the identifying columns is returned.
  #
  # note: no aggregation is done, which makes sense if your data contains
  #       lots of replicate measurements you want to keep separate
  # note: this also works best when you have values for each chosen point_type
  #       for each combination of other columns. if not, you get lots of NAs
  # note: needs the dplyr and reshape2 packages installed (they do not have
  #       to be attached); a temporary column named 'counter' is used
  idcols <- colnames(df)[!colnames(df) %in% c('point_type', 'point_value')]

  # keep only the rows whose point_type matches the regex
  df <- df[grepl(ptype_regex, df[['point_type']]), , drop = FALSE]
  if (nrow(df) == 0) {
    # nothing to cast; dcast cannot build value columns from zero rows
    return(df[, idcols, drop = FALSE])
  }

  # number the replicates inside each (id columns, point_type) group so that
  # every row gets a unique key and dcast never has to aggregate
  group_syms <- lapply(c(idcols, 'point_type'), as.symbol)
  df <- dplyr::group_by(df, !!!group_syms)
  df <- dplyr::mutate(df, counter = seq_len(dplyr::n()))
  df <- as.data.frame(dplyr::ungroup(df))

  dcform <- paste(paste(c(idcols, 'counter'), collapse = '+'), '~', 'point_type')
  df <- reshape2::dcast(df, formula = dcform, value.var = 'point_value',
                        drop = TRUE)

  # the helper key is not part of the result
  df[['counter']] <- NULL
  df
}

# Peek at both ends of the data set used in the examples below. The printed
# output shows two identifying columns (resistance, light_bin) in addition to
# strain, light and hours, followed by point_type and point_value.
head(data)
#   strain resistance light  light_bin hours point_type point_value
# 1  blank       <NA>    25 025-025 uE    17      od750      0.4058
# 2  blank       <NA>    25 025-025 uE    17      od750      0.1252
# 3  blank       <NA>    25 025-025 uE    17      od750      0.1993
# 4  blank       <NA>    25 025-025 uE    17      od750      0.1343
# 5  blank       <NA>    25 025-025 uE    17      od750      0.0555
# 6  blank       <NA>    25 025-025 uE    17      od750      0.0881
tail(data)
#        strain resistance light  light_bin hours point_type point_value
# 93516 strain1         Km    25 025-025 uE    17  log_od750  -0.9670579
# 93517 strain1         Km    25 025-025 uE    17  log_od750  -1.4605870
# 93540 strain1         Km    25 025-025 uE    17  log_od750  -1.8300846
# 93542 strain1         Km    25 025-025 uE    17  log_od750  -1.1779802
# 93554 strain1         Km    25 025-025 uE    17  log_od750  -2.0448469
# 93556 strain1         Km    25 025-025 uE    17  log_od750  -1.8413700

# The regex 'log' yields a single value column here (log_od750); the
# identifying columns are carried over unchanged.
head(cast_points(data, 'log'))
#   strain resistance light  light_bin hours  log_od750
# 1  blank       <NA>    25 025-025 uE    17 -0.9018948
# 2  blank       <NA>    25 025-025 uE    17 -2.0778428
# 3  blank       <NA>    25 025-025 uE    17 -1.6129440
# 4  blank       <NA>    25 025-025 uE    17 -2.0076792
# 5  blank       <NA>    25 025-025 uE    17 -2.8913722
# 6  blank       <NA>    25 025-025 uE    17 -2.4292827

# The regex 'od' matches both od750 and log_od750, so the result has one
# column for each; replicate rows are kept rather than aggregated.
head(cast_points(data, 'od'))
#   strain resistance light  light_bin hours  log_od750  od750
# 1  blank       <NA>    25 025-025 uE    17 -0.9018948 0.4058
# 2  blank       <NA>    25 025-025 uE    17 -2.0778428 0.1252
# 3  blank       <NA>    25 025-025 uE    17 -1.6129440 0.1993
# 4  blank       <NA>    25 025-025 uE    17 -2.0076792 0.1343
# 5  blank       <NA>    25 025-025 uE    17 -2.8913722 0.0555
# 6  blank       <NA>    25 025-025 uE    17 -2.4292827 0.0881

编辑：如果还有人读到这里，下面是另外几个用于处理同类数据的函数：

remove_points <- function(df, ptype_regex) {
  # Drops rows from a "long-ish" data frame
  # (identifying columns + point_type, point_value).
  #
  # Every row whose point_type matches ptype_regex is discarded and all other
  # rows are returned untouched. Handy for clearing out the points that were
  # just pulled into wide format with cast_points.
  is_match <- grepl(ptype_regex, df[['point_type']])
  df[!is_match,]
}

append_points <- function(df1_longish, df2_wide) {
  # Append wide-format points to a "long-ish" data frame.
  #
  # Args:
  #   df1_longish: main data frame in "long-ish" format
  #                (identifying columns + point_type, point_value)
  #   df2_wide:    data frame in wide format; it must contain the same
  #                identifying columns as df1_longish, and every other column
  #                is treated as a point type
  #
  # Returns:
  #   df1_longish with the melted rows of df2_wide bound underneath
  #
  # note: needs the reshape2 package installed
  idcols <- colnames(df1_longish)[!colnames(df1_longish)
                                  %in% c('point_type', 'point_value')]
  # melt the function argument itself (not a variable from the calling
  # environment) back into point_type / point_value pairs
  df2_longish <- reshape2::melt(df2_wide, id.vars = idcols,
                                variable.name = 'point_type',
                                value.name = 'point_value')
  rbind(df1_longish, df2_longish)
}

sac_points <- function(df, fn, ptype_regex) {
  # Split-apply-combine for "long-ish" data.
  #
  # Args:
  #   df:          data frame in "long-ish" format
  #                (identifying columns + point_type, point_value)
  #   fn:          function that takes the wide-format data frame built from
  #                the matching points and returns a wide-format data frame
  #   ptype_regex: regular expression selecting the point types handed to fn
  #
  # Returns:
  #   the rows of df that did not match ptype_regex, with the melted result
  #   of fn appended

  # split: rows that do not match are set aside untouched
  misc <- remove_points(df, ptype_regex)
  # apply: matching rows go to wide format and through the user function
  wide <- cast_points(df, ptype_regex)
  wide <- fn(wide)
  # combine: melt the result and bind it to the rows that were set aside
  append_points(misc, wide)
}

Answer 1

您似乎有两个不同的问题：您只想保留point_type的某些级别，然后您希望point_type的级别成为列。如果是这样，你可以这样做：

library(reshape2)

# Subset: keep the rows whose point_type contains "od750" or "loc"
data.keep <- data[grepl("od750|loc", data$point_type), ]

# Convert to wide format: one column per remaining point_type level
data.keep.wide <- dcast(
  data.keep,
  formula = strain + light + hours ~ point_type,
  value.var = "point_value"
)

第一行代码将保留point_type中包含文本"od750"或"loc"的所有行（我只是选这两个来举例说明）。第二行将剩余的point_type级别转换为列。

这里是结果的前几行：

data.keep.wide

     strain light hours locfit locfit_err log_od750 od750
1   strain1    24    48     NA         60        NA    NA
2   strain1    67    74     16         NA        NA    NA
3   strain1    78    20      7         NA        NA    NA
4   strain1    83     5     NA         NA        58    NA
5   strain1    95    84     NA         NA        NA    47
6  strain10    13    53     NA         NA        35    NA
7  strain10    42    90     NA         NA        NA    78

请注意，在每一行中，只有一个新列包含值。根据您问题中的信息，我假设在您的实际数据中，每个小时下point_type的每个级别都有对应的point_value。

更新：您的评论似乎表明，您的数据中菌株、光照、小时和point_type的每个唯一组合可能对应多行。如果是这种情况，并且您希望保留所有这些行，那么可以在取子集之后执行以下操作。下面的代码按所有相关列对数据进行分组，然后添加counter列，为给定组内的每一行提供一个唯一的"ID"。然后我们把counter放在dcast公式的左侧（LHS），这样就能保留所有行而不进行聚合：

library(reshape2)
library(dplyr)

# Convert to wide format, keeping all rows.
# counter numbers the replicates inside each group so every row has a unique
# key and dcast does not aggregate; seq_len(n()) is used instead of 1:n() so
# that zero-row input does not produce the length-2 vector c(1, 0).
data.keep.wide <- data.keep %>%
  group_by(strain, light, hours, point_type) %>%
  mutate(counter = seq_len(n())) %>%
  dcast(
    strain + light + hours + counter ~ point_type,
    value.var = "point_value"
  )

如何在R中将“long-ish”数据转换为宽格式，删除一些宽列变量的某些值？

1 个答案: