Question

我有两个不同长度的数据框，每个数据框有3列：＆＃39; name＆＃39; ＆＃39;量＆＃39; ＆＃39;一年＆＃39;

我希望能够识别两个数据框中的常见“名称”变量，并计算每个变量之间的数量差异。例如：

 name year amount difference
 James 2010  934706   -2340
 Rory  2010   869691  240576
 Suzanne 2010 651674   37765
 Felicity 2010 386115 -512275
 Oliver   2010  382388 -278410

到目前为止，我有：

common_order <- intersect(colnames(d1), colnames(d2)) #finds the common variables between the two dataframes
d2=d2[, common_order] #variables in d2 in the same order as d1
d1=d1[, common_order]
d3=rbind(d1,d2) #binds 2 data frames together

d4=aggregate(d3[,1:3],list(d3$name),(d2-d1)) # this line of code is not working as the data frames are not equally sized

有没有办法做到这一点？

dput(head(d1, 20))
structure(list(year = c(1950, 1950, 1950, 1950, 1950, 1950, 1950, 
1950, 1950, 1950, 1950, 1950, 1950, 1950, 1950, 1950, 1950, 1950, 
1950, 1950), name = c("Linda", "Mary", "Patricia", "Barbara", 
"Susan", "Nancy", "Deborah", "Sandra", "Carol", "Kathleen", "Sharon", 
"Karen", "Donna", "Brenda", "Margaret", "Diane", "Pamela", "Janet", 
"Shirley", "Carolyn"), sex = c("F", "F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", 
"F"), amount = c(80412L, 65443L, 47920L, 41560L, 38019L, 29618L, 
29074L, 28885L, 26159L, 25693L, 24612L, 24137L, 21633L, 20797L, 
18111L, 17616L, 16201L, 16189L, 15876L, 15593L)), .Names = c("year", 
"name", "sex", "amount"), row.names = c(NA, 20L), class = "data.frame")

dput(head(d2, 20))
structure(list(year = c(2010, 2010, 2010, 2010, 2010, 2010, 2010, 
2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 
2010, 2010), name = c("Isabella", "Sophia", "Emma", "Olivia", 
"Ava", "Emily", "Abigail", "Madison", "Chloe", "Mia", "Addison", 
"Elizabeth", "Ella", "Natalie", "Samantha", "Alexis", "Lily", 
"Grace", "Hailey", "Alyssa"), sex = c("F", "F", "F", "F", "F", 
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", 
"F", "F"), amount = c(22822L, 20566L, 17277L, 16959L, 15382L, 
14235L, 14208L, 13133L, 11715L, 10599L, 10302L, 10209L, 9844L, 
8746L, 8378L, 8222L, 7947L, 7651L, 6993L, 6963L)), .Names = c("year", 
"name", "sex", "amount"), row.names = c(NA, 20L), class = "data.frame")

Answer 1

尝试：

 transform(merge(d1,d2[,-3], by='name'), 
              difference=amount.x-amount.y)[,-c(4,6)]

  #       name year.x sex year.y difference
  #1     Carol   1950   F   2010       5593
  #2   Carolyn   1950   F   2010      -1366
  #3     Karen   1950   F   2010       1315
  #4  Kathleen   1950   F   2010      11485
  #5      Mary   1950   F   2010      57792
  #6  Patricia   1950   F   2010      40957
  #7    Sandra   1950   F   2010      18286
  #8    Sharon   1950   F   2010      14403
  #9   Shirley   1950   F   2010       5574
  #10    Susan   1950   F   2010      30072

数据

将d2更改为overlapping个名称

 set.seed(24)
 d2$name <- sample(c(d1$name, d2$name), 20, replace=FALSE)

比较两个数据帧之间的值差异

1 个答案:

数据