Question

我的 dcast R 代码无法再运行了。我遇到了这里讨论的问题：segfault in R using reshape2 package and dcast

该错误尚未修复，因此我正在寻找其他方法来实现我的dcast输出。任何建议将不胜感激！

下面是我的数据集的一个很小的子集。基本上，每个调查ID（“EID”）的每个物种各有一条记录。我希望每个调查ID（“EID”）只对应一条记录，所有物种各自成为一列，列中是相关的值（“值”），即宽格式。

# Console transcript: dput() dump of the 24-row sample data frame with columns
# EID (character), tspp.name (factor), elasmo.name (factor) and
# elasmo.discard (numeric).
# Note: the last level of elasmo.name is the literal string "NA", not a
# missing value.
> dput(sample)
structure(list(EID = c("L00155/69/2000-09-06", "Q99107/178/1999-08-23", 
"G02192/1/2002-07-08", "G97158/1/1997-10-26", "Q06091/2/2006-07-04", 
"L00004/171/2000-03-01", "G11094/15/2011-09-05", "Q04127/16/2004-07-28", 
"Q02122/230/2002-10-29", "G08002/6/2008-02-03", "Q99006/143/1999-02-17", 
"Q08053/3/2008-06-12", "Q99128/22/1999-08-19", "L00177/83/2000-12-18", 
"Q05122/11/2005-08-30", "Q04156/44/2004-10-29", "L01097/69/2001-06-26", 
"G08004/169/2008-05-14", "Q03041/26/2003-06-14", "G98115/60/1998-09-11", 
"G00002/20/2000-01-17", "G00002/20/2000-01-17", "G00054/1/2000-05-31", 
"G00054/1/2000-05-31"), tspp.name = structure(c(13L, 13L, 13L, 
13L, 16L, 13L, 13L, 4L, 13L, 13L, 13L, 13L, 13L, 11L, 4L, 13L, 
13L, 13L, 13L, 20L, 13L, 13L, 24L, 24L), .Label = c("American plaice", 
"American sand lance", "Arctic cod", "Atlantic cod", "Atlantic halibut", 
"Atlantic herring", "Bigeye tuna", "Black dogfish", "Bluefin tuna", 
"Capelin", "Greenland halibut", "Lookdown", "Northern shrimp", 
"Ocean quahog", "Porbeagle", "Redfishes", "Slenteye headlightfish", 
"Smooth flounder", "Spiny dogfish", "Striped pink shrimp", "Summer flounder", 
"White hake", "Winter flounder", "Witch flounder", "Yellowtail flounder"
), class = "factor"), elasmo.name = structure(c(26L, 30L, 30L, 
30L, 30L, 25L, 21L, 30L, 30L, 30L, 30L, 21L, 30L, 5L, 30L, 30L, 
30L, 21L, 30L, 30L, 14L, 21L, 24L, 21L), .Label = c("Arctic skate", 
"Atlantic sharpnose shark", "Barndoor skate", "Basking shark", 
"Black dogfish", "Blue shark", "Deepsea cat shark", "Greenland shark", 
"Jensen's skate", "Little skate", "Manta", "Ocean quahog", "Oceanic whitetip shark", 
"Porbeagle", "Portuguese shark", "Rough sagre", "Roughtail stingray", 
"Round skate", "Sharks", "Shortfin mako", "Skates", "Smooth skate", 
"Soft skate", "Spiny dogfish", "Spinytail skate", "Thorny skate", 
"White shark", "White skate", "Winter skate", "NA"), class = "factor"), 
    elasmo.discard = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 
    25, 0, 0, 0, 1, 0, 0, 1, 1, 15, 25)), .Names = c("EID", "tspp.name", 
"elasmo.name", "elasmo.discard"), class = "data.frame", row.names = c("18496", 
"488791", "87549", "236671", "139268", "15606", "11132", "115531", 
"93441", "159675", "403751", "42587", "485941", "19285", "130395", 
"119974", "73826", "7953", "99124", "351461", "71", "72", "184", 
"185"))

最后，我希望得到这个：

# dcast() is provided by reshape2 (plyr does not export it), so attach
# reshape2 to make this snippet runnable on its own.
library(reshape2)

# Cast to wide format: `...` keeps every remaining column (EID, tspp.name) as
# row identifiers, each elasmo.name level becomes a column, and the cells hold
# the summed elasmo.discard for that combination.
test <- dcast(sample, ... ~ elasmo.name,
              value.var = "elasmo.discard",
              fun.aggregate = sum)
test

请注意，“dcast”代码在这里工作，但是当我在包含145349行的整个数据集上运行它时，我会收到致命错误。

非常感谢!!

Answer 1

这是在 Hadley 的包出现之前的做法（只用 base R）：先聚合得到总和，然后重塑。

# Base-R alternative to dcast(): aggregate first, then reshape to wide.
# `sample` is the data frame from the question; columns are selected by name
# so the code does not depend on column order.

# Step 1: sum elasmo.discard within each EID / tspp.name / elasmo.name group.
foo <- aggregate(sample["elasmo.discard"],
                 by = sample[c("EID", "tspp.name", "elasmo.name")],
                 FUN = sum)

# Step 2: spread elasmo.name into columns named "elasmo.discard.<level>".
# Combinations that do not occur in the data come out as NA here, whereas
# dcast(..., fun.aggregate = sum) fills them with 0.
reshape(foo, v.names = "elasmo.discard", idvar = c("EID", "tspp.name"),
        timevar = "elasmo.name", direction = "wide")

如果第一部分很慢，可能有助于在“by”部分中减少列数;看起来tspp.name由EID定义，如果是这样，请不要通过它聚合，而是在事后添加它。

如果第二部分很慢，也许可以尝试以下方法之一： https://stackoverflow.com/a/9617424/210673

为了更好地帮助加快速度，请提供一个合适的示例数据集（例如用 `sample` 或 `rep` 生成），以便对代码进行测试。解决方案的速度通常取决于每个变量的唯一组合数。

Answer 2

我无法重现错误。请参阅下面的代码。我已将sample的行数增加到196608。

sample$elasmo.name中的类别数量可能起作用。

library(reshape2)

# Rebuild the question's 24-row example data frame from its dput() dump.
# Columns: EID (character), tspp.name (factor), elasmo.name (factor) and
# elasmo.discard (numeric).
# Note: the last level of elasmo.name is the literal string "NA", not a
# missing value.
# Note: the name `sample` is also a base R function; calls such as sample(x)
# still resolve to the function.
sample <- structure(list(EID = c("L00155/69/2000-09-06", "Q99107/178/1999-08-23", 
  "G02192/1/2002-07-08", "G97158/1/1997-10-26", "Q06091/2/2006-07-04", 
  "L00004/171/2000-03-01", "G11094/15/2011-09-05", "Q04127/16/2004-07-28", 
  "Q02122/230/2002-10-29", "G08002/6/2008-02-03", "Q99006/143/1999-02-17", 
  "Q08053/3/2008-06-12", "Q99128/22/1999-08-19", "L00177/83/2000-12-18", 
  "Q05122/11/2005-08-30", "Q04156/44/2004-10-29", "L01097/69/2001-06-26", 
  "G08004/169/2008-05-14", "Q03041/26/2003-06-14", "G98115/60/1998-09-11", 
  "G00002/20/2000-01-17", "G00002/20/2000-01-17", "G00054/1/2000-05-31", 
  "G00054/1/2000-05-31"), tspp.name = structure(c(13L, 13L, 13L, 
  13L, 16L, 13L, 13L, 4L, 13L, 13L, 13L, 13L, 13L, 11L, 4L, 13L, 
  13L, 13L, 13L, 20L, 13L, 13L, 24L, 24L), .Label = c("American plaice", 
  "American sand lance", "Arctic cod", "Atlantic cod", "Atlantic halibut", 
  "Atlantic herring", "Bigeye tuna", "Black dogfish", "Bluefin tuna", 
  "Capelin", "Greenland halibut", "Lookdown", "Northern shrimp", 
  "Ocean quahog", "Porbeagle", "Redfishes", "Slenteye headlightfish", 
  "Smooth flounder", "Spiny dogfish", "Striped pink shrimp", "Summer flounder", 
  "White hake", "Winter flounder", "Witch flounder", "Yellowtail flounder"
  ), class = "factor"), elasmo.name = structure(c(26L, 30L, 30L, 
  30L, 30L, 25L, 21L, 30L, 30L, 30L, 30L, 21L, 30L, 5L, 30L, 30L, 
  30L, 21L, 30L, 30L, 14L, 21L, 24L, 21L), .Label = c("Arctic skate", 
  "Atlantic sharpnose shark", "Barndoor skate", "Basking shark", 
  "Black dogfish", "Blue shark", "Deepsea cat shark", "Greenland shark", 
  "Jensen's skate", "Little skate", "Manta", "Ocean quahog", "Oceanic whitetip shark", 
  "Porbeagle", "Portuguese shark", "Rough sagre", "Roughtail stingray", 
  "Round skate", "Sharks", "Shortfin mako", "Skates", "Smooth skate", 
  "Soft skate", "Spiny dogfish", "Spinytail skate", "Thorny skate", 
  "White shark", "White skate", "Winter skate", "NA"), class = "factor"), 
      elasmo.discard = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 
      25, 0, 0, 0, 1, 0, 0, 1, 1, 15, 25)), .Names = c("EID", "tspp.name", 
  "elasmo.name", "elasmo.discard"), class = "data.frame", row.names = c("18496", 
  "488791", "87549", "236671", "139268", "15606", "11132", "115531", 
  "93441", "159675", "403751", "42587", "485941", "19285", "130395", 
  "119974", "73826", "7953", "99124", "351461", "71", "72", "184", 
  "185"))

# Work out how many doublings of `sample` are needed to reach at least N rows,
# where N is the row count of the asker's full data set.
n <- nrow(sample)
N <- 145349
# Clamp at 0 so a data frame that is already large enough is left unchanged
# (an unclamped `1:p` would iterate over c(1, 0) when p is 0).
p <- max(0, ceiling(log2(N / n)))
n * 2^p
n * 2^p > N

# Replicate the whole data frame 2^p times in a single indexing step instead
# of growing it with rbind() in a loop. Row order matches repeated doubling;
# only the automatically generated row names differ.
sample <- sample[rep(seq_len(n), times = 2^p), ]

# Sanity checks on the enlarged data frame: row count, class and first rows.
nrow(sample)

class(sample)
head(sample)

# Frequency of elasmo.name, first tabulated as a factor (all levels listed),
# then as character (only the values actually present).
table(sample[["elasmo.name"]])
table(as.character(sample[["elasmo.name"]]))

# Cast to wide format: one column per elasmo.name level, remaining columns
# kept as row identifiers, cells holding the summed elasmo.discard.
test <- dcast(
  data = sample,
  formula = ... ~ elasmo.name,
  fun.aggregate = sum,
  value.var = "elasmo.discard"
)
head(test)

在不使用dcast（reshape2）的情况下重塑R中的数据

2 个答案: