Question

我在R中有两个不同的数据框。第一个df1包含一批城市的数据，每个城市都与其所属的州相关联。第二个包含按州汇总的数据，但这些数据又被分成若干类。像这样：

# City-level data: 14 cities spread over four states (5 + 4 + 2 + 3).
states1 <- rep(c("a", "b", "c", "d"), times = c(5, 4, 2, 3))
cities <- LETTERS[seq_len(14)]
data1 <- c(
  123, 222, 444, 125, 687,
  987, 556, 445, 444,
  659, 798,
  113, 325, 144
)
df1 <- data.frame(states1, cities, data1)

# State-level data: every state carries one value for each of two classes.
states2 <- rep(c("a", "b", "c", "d"), each = 2)
classes <- rep(c(1, 2), times = 4)
data2 <- c(65, 21, 44, 25, 37, 87, 58, 47)
df2 <- data.frame(states2, classes, data2)

我想基于特定条件将两个数据帧中的数据列相乘，以创建第三个数据帧。我想为每个城市将其数据乘以其相应州的数据，创建两列，每个类别一列。

例如：

对于属于州a的城市A和B，以及属于州c的城市K，我需要将其数据乘以各自所属州的数据，并且对1类和2类分别计算。也就是说，我想以对应的州作为匹配条件，将城市的数据分别乘以两个类别的数据。例如：

# Hand-computed illustration of the desired result for three cities:
# each city's value times the class 1 and class 2 values of its own state.
multA <- c(123 * 65, 123 * 21) # city A (state a)
multB <- c(222 * 65, 222 * 21) # city B (state a)
multK <- c(798 * 37, 798 * 87) # city K (state c); K's value is 798 in both terms

# One row per city, one column per class.
df3 <- data.frame(rbind(multA, multB, multK))
colnames(df3) <- c("class 1", "class 2")

但是，当然，我想针对每个城市自动执行此操作。我尝试使用which函数和dplyr包，但到目前为止，我还没有想出解决方案。是否有某个包或内置函数可以在不编写显式循环的情况下做到这一点？谢谢！

Answer 1

解决这个问题的方法有很多，但是我偏爱SQL，因此我将使用sqldf和reshape2包来完成此操作：

library(sqldf)
library(reshape2)

# Join each city to the class rows of its own state and multiply the city
# value by every class value. The join condition is stated explicitly with
# ON, and data2 is qualified with its table alias so the query stays
# unambiguous. The result is in long form: one row per (city, class) pair,
# with the product stored in the column aliased as class1.
df4 <- sqldf("select
                     a.cities,
                     b.classes,
                     a.data1 * b.data2 as class1
              from
                     df1 a join df2 b
                     on a.states1 = b.states2")

# Reshape from long to wide: one row per city, one column per class.
df4_reshaped <- dcast(df4, formula = cities ~ classes, value.var = "class1")
names(df4_reshaped) <- c("city", "class 1", "class 2")

这将得到一个把城市作为一列的数据框（我个人认为这比像df3那样把城市作为行名更有用）：

> df4_reshaped
   city class 1 class 2
1     A    7995    2583
2     B   14430    4662
3     C   28860    9324
4     D    8125    2625
5     E   44655   14427
6     F   43428   24675
7     G   24464   13900
8     H   19580   11125
9     I   19536   11100
10    J   24383   57333
11    K   29526   69426
12    L    6554    5311
13    M   18850   15275
14    N    8352    6768

要获得您想要的精确形状，您需要设置行名并删除城市列：

# Label every row "mult<city>", mirroring the row names used in df3.
rownames(df4_reshaped) <- paste0("mult", df4_reshaped[["city"]])

# The city is now carried by the row names, so drop the first (city) column.
df4_reshaped2 <- df4_reshaped[, -1]

所以您现在的最终输出是：

> df4_reshaped2
      class 1 class 2
multA    7995    2583
multB   14430    4662
multC   28860    9324
multD    8125    2625
multE   44655   14427
multF   43428   24675
multG   24464   13900
multH   19580   11125
multI   19536   11100
multJ   24383   57333
multK   29526   69426
multL    6554    5311
multM   18850   15275
multN    8352    6768

Answer 2

这是使用dplyr的解决方案：

library(dplyr)

# Attach both class rows of the matching state to every city, multiply the
# city value by each class value, then collapse the two rows per city into
# a single row with one product column per class.
df1 %>%
  left_join(df2, by = c("states1" = "states2")) %>%
  mutate(vals = data1 * data2) %>%
  group_by(states1, cities) %>%
  summarise(
    `class 1` = vals[classes == 1],
    `class 2` = vals[classes == 2]
  )

# A tibble: 14 x 4
# Groups:   states1 [?]
   states1 cities `class 1` `class 2`
   <fct>   <fct>      <dbl>     <dbl>
 1 a       A           7995      2583
 2 a       B          14430      4662
 3 a       C          28860      9324
 4 a       D           8125      2625
 5 a       E          44655     14427
 6 b       F          43428     24675
 7 b       G          24464     13900
 8 b       H          19580     11125
 9 b       I          19536     11100
10 c       J          24383     57333
11 c       K          29526     69426
12 d       L           6554      5311
13 d       M          18850     15275
14 d       N           8352      6768

Answer 3

为了扩展@Frank在评论中给出的基础R解决方案，可以先对df2进行reshape，然后通过merge和transform计算两组数值列的乘积：

# RESHAPE LONG TO WIDE: one row per state, one data2 column per class
rdf2 <- reshape(df2, idvar = "states2", v.names = "data2",
                timevar = "classes", direction = "wide")

# RENAME COLUMNS
rdf2 <- setNames(rdf2, c("states2", "class1", "class2"))

# MERGE AND TRANSFORM: attach the state's class values to every city,
# then overwrite them with the product of the city value and each class value
final_df <- transform(merge(df1, rdf2, by.x = "states1", by.y = "states2"),
                      class1 = data1 * class1,
                      class2 = data1 * class2)
final_df
#    states1 cities data1 class1 class2
# 1        a      A   123   7995   2583
# 2        a      B   222  14430   4662
# 3        a      C   444  28860   9324
# 4        a      D   125   8125   2625
# 5        a      E   687  44655  14427
# 6        b      F   987  43428  24675
# 7        b      G   556  24464  13900
# 8        b      H   445  19580  11125
# 9        b      I   444  19536  11100
# 10       c      J   659  24383  57333
# 11       c      K   798  29526  69426
# 12       d      L   113   6554   5311
# 13       d      M   325  18850  15275
# 14       d      N   144   8352   6768

b_array

将两个大小不同的数据框中的列相乘

3 个答案: