Question

我有以下数据集（示例）。显然，我使用的真实数据集要大得多：

gvkey     tic  stko   year
001689   AEP1     1   2011
017096    BA3     1   2011
001440    AEP     0   2011
002285     BA     0   2011
001689   AEP1     1   2012
017096    BA3     1   2012
001440    AEP     0   2012
002285     BA     0   2012

以下是生成数据的代码：

# Example panel: four firms, each observed in 2011 and 2012.
# The four-element columns are recycled by data.frame() to fill all eight rows.
dat <- data.frame(
  gvkey = c("001689", "017096", "001440", "002285"),
  tic = c("AEP1", "BA3", "AEP", "BA"),
  stko = c(1, 1, 0, 0),
  year = rep(c(2011, 2012), each = 4)
)

以下是我想要做的事情：每行代表一个“公司—年份”组合，tic是公司的股票代码。stko等于1的公司是子公司，其tic与母公司相同，只是在末尾附加了一个数字，例如AEP1属于AEP。基本上，我想创建一个新变量parent，为每个子公司（stko=1的行）给出其母公司的gvkey，并且要按年份分别处理。最终数据集应如下所示：

gvkey     tic  stko   year  parent
001689   AEP1     1   2011  001440
017096    BA3     1   2011  002285
001440    AEP     0   2011  
002285     BA     0   2011  
001689   AEP1     1   2012  001440
017096    BA3     1   2012  002285
001440    AEP     0   2012  
002285     BA     0   2012

现在，我最初的方法是编写几个for循环，在给定年份内逐行迭代。每当某行的stko=1时，就提取股票代码中去掉末尾数字的部分（例如，第一行得到AEP），然后在同一年份中找到股票代码与之完全相同的行（例如，2011年的第3行），并把该行的gvkey复制到最初那条stko=1的观测上。

但是，考虑到我的数据集的大小，这个过程会非常慢。如果有人能想到更快更容易的方法，我将不胜感激。

非常感谢!!

使用我的主数据集，dput(droplevels(head(dat)))的输出为：

# dput() output for the first six rows of the real dataset: a data.frame with
# 6 rows and 13 columns (see .Names below).
# - gvkey and tic are character vectors; fyear and stko are numeric; the
#   remaining columns are factors.
# - The year column is called `fyear` here, not `year` as in the toy example.
# - All six sampled rows have stko == 0.
structure(list(gvkey = c("176017", "128663", "61586", "278120", 
"14062", "285313"), datadate = structure(c(4L, 4L, 1L, 3L, 2L, 
1L), .Label = c("31dec2010", "31dec2011", "31dec2012", "31dec2013"
), class = "factor"), fyear = c(2013, 2013, 2010, 2012, 2011, 
2010), indfmt = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "INDL", class = "factor"), 
consol = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "C", class = "factor"), 
popsrc = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "D", class = "factor"), 
datafmt = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "STD", class = "factor"), 
tic = c("ATHX", "SQNM", "IMH", "FNLIF", "CCDBF", "BSBR"), 
cusip = structure(c(1L, 6L, 5L, 4L, 3L, 2L), .Label = c("04744L106", 
"05967A107", "124900309", "33564P103", "45254P508", "817337405"
), class = "factor"), conm = structure(c(1L, 6L, 5L, 4L, 
3L, 2L), .Label = c("ATHERSYS INC", "BANCO SANTANDER BRASIL  -ADR", 
"CCL INDUSTRIES  -CL B", "FIRST NATIONAL FINL CORP", "IMPAC MORTGAGE HOLDINGS INC", 
"SEQUENOM INC"), class = "factor"), curcd = structure(c(2L, 
2L, 2L, 1L, 1L, 2L), .Label = c("CAD", "USD"), class = "factor"), 
costat = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "A", class = "factor"), 
stko = c(0, 0, 0, 0, 0, 0)), .Names = c("gvkey", "datadate", 
"fyear", "indfmt", "consol", "popsrc", "datafmt", "tic", "cusip", 
"conm", "curcd", "costat", "stko"), row.names = c(NA, 6L), class = "data.frame")

Answer 1

dplyr的另一个选项：

# library() fails loudly if dplyr is missing; require() would only return FALSE.
library(dplyr)

# Attach to every subsidiary row (stko == 1) the gvkey of its parent in the
# same year; parent rows get an empty string.
dat %>%
  # Root ticker: strip only the digits appended at the end (AEP1 -> AEP), so
  # digits elsewhere in a ticker are left untouched.
  mutate(tic2 = sub("[0-9]+$", "", tic)) %>%
  # A parent and its subsidiaries share the same root ticker within a year.
  group_by(tic2, year) %>%
  # gvkey[stko == 0][1] is the first parent gvkey in the group; it is NA when
  # the group has no parent row.
  mutate(parent = ifelse(stko == 1, as.character(gvkey[stko == 0][1]), "")) %>%
  ungroup() %>%
  select(-tic2)


#Source: local data frame [8 x 5]
#
#   gvkey  tic stko year parent
#1 001689 AEP1    1 2011 001440
#2 017096  BA3    1 2011 002285
#3 001440  AEP    0 2011       
#4 002285   BA    0 2011       
#5 001689 AEP1    1 2012 001440
#6 017096  BA3    1 2012 002285
#7 001440  AEP    0 2012       
#8 002285   BA    0 2012

编辑：如果可能存在找不到匹配母公司的子公司，请尝试以下代码：

# Same as the pipeline above, but subsidiaries whose group contains no parent
# row (no stko == 0) get an empty string instead of NA.
dat %>%
  # Root ticker: strip only the digits appended at the end (AEP1 -> AEP).
  mutate(tic2 = sub("[0-9]+$", "", tic)) %>%
  group_by(tic2, year) %>%
  # any(stko == 0) is TRUE only when the group actually contains a parent.
  mutate(parent = ifelse(stko == 1 & any(stko == 0),
                         as.character(gvkey[stko == 0][1]), "")) %>%
  ungroup() %>%
  select(-tic2)

Answer 2

看起来你已经有了一个可行的解决方案，但想要提高速度。为此，应当使用子集选取和向量化操作来代替循环（请参阅this post）。我还建议寻找基于dplyr或data.table的解决方案，因为它们都比基础R快得多。

这是我尝试基于这些想法的解决方案。

# library() fails loudly if dplyr is missing; require() would only return FALSE.
library(dplyr)

# Lookup table of parent firms: one row per (ticker, year, gvkey).
# Keeping `year` lets the match be done per year, as the question asks.
parents <- dat %>%
  filter(stko == 0) %>%
  select(tic, gvkey, year) %>%
  distinct()

# Add parentTic (ticker without its trailing digits) and parentGvkey.
# An exact join replaces the row-name lookup, which matches row names
# partially and fails on duplicated tickers.
# NOTE(review): if one (ticker, year) maps to several parent gvkeys, the join
# duplicates the subsidiary row -- confirm the data has no such cases.
dat2 <- dat %>%
  mutate(parentTic = sub("[0-9]+$", "", tic)) %>%
  left_join(
    rename(parents, parentTic = tic, parentGvkey = gvkey),
    by = c("parentTic", "year")
  ) %>%
  # Only subsidiaries have a parent; parent rows would otherwise match
  # themselves and receive their own gvkey.
  mutate(
    parentGvkey = ifelse(stko == 1, as.character(parentGvkey), NA_character_)
  )

Answer 3

这是使用base::merge函数的一个相当简单的练习：

 # Stack two pieces: parent rows (stko == 0) with an empty `parent`, and
 # subsidiary rows with the gvkey of the parent found in the same year.
 rbind(
   transform(dat[dat$stko == 0, ], parent = ""),
   transform(
     merge(
       # Subsidiaries, keyed by the ticker without its trailing digits.
       transform(dat[dat$stko != 0, ], tic.parent = sub("[0-9]+$", "", tic)),
       # Parents: one row per (tic, year) carrying their gvkey as `parent`.
       unique(
         transform(dat[dat$stko == 0, ], parent = gvkey)[
           , c("tic", "parent", "year")
         ]
       ),
       by.x = c("tic.parent", "year"), by.y = c("tic", "year"),
       # Keep subsidiaries without a matching parent; merge() would drop them.
       all.x = TRUE
     )[, -1], # drop the helper key column tic.parent (first column after merge)
     # Unmatched subsidiaries get "" so the column matches the parent rows.
     parent = ifelse(is.na(parent), "", as.character(parent))
   )
 )

 #     gvkey  tic stko year parent
 # 3  001440  AEP    0 2011
 # 4  002285   BA    0 2011
 # 7  001440  AEP    0 2012
 # 8  002285   BA    0 2012
 # 5  001689 AEP1    1 2011 001440
 # 6  001689 AEP1    1 2012 001440
 # 71 017096  BA3    1 2011 002285
 # 81 017096  BA3    1 2012 002285

查找给定行的data.frame中的值

3 个答案: