R版本3.4.2
我试图根据基于同一数据帧的其他变量的条件创建3个新变量。我设法实现了我的目标,但它需要几行代码来生成输出,我认为其他方法(可能使用dplyr)可以毫不费力地提供。
这是一个可复制的例子:
city <- c("London", "London", "Leeds","Leeds", "Leeds", "Nottingham", "Glasgow", "Belfast", "Belfast", "Oxford", "Oxford", "Southampton", "Aberdeen", "Bath", "Bath", "Bath", "Preston", "Preston", "Liverpool", "Derby","Hereford")
transport <- c("cars", "scooters", "cars", "scooters", "bikes", "cars", "scooters", "cars", "bikes", "scooters", "bikes", "bikes", "scooters", "cars", "scooters", "bikes", "scooters", "bikes", "bikes", "cars", "bikes")
number <- c("153", "21", "267", "87", "13", "95", "17", "199", "8", "34", "5", "23", "40", "142", "79", "28", "37", "22", "19", "83", "23")
df <- data.frame(city, transport, number)
我想知道的是知道每个城市中每种交通工具的百分比,如下:
> df
city transport number pct.cars pct.scooters pct.bikes
1 London cars 153 87.93 12.07 0.00
2 London scooters 21 87.93 12.07 0.00
3 Leeds cars 267 72.75 23.71 3.54
4 Leeds scooters 87 72.75 23.71 3.54
5 Leeds bikes 13 72.75 23.71 3.54
6 Nottingham cars 95 100.00 0.00 0.00
7 Glasgow scooters 17 0.00 100.00 0.00
8 Belfast cars 199 96.14 0.00 3.86
9 Belfast bikes 8 96.14 0.00 3.86
10 Oxford scooters 34 0.00 87.18 12.82
11 Oxford bikes 5 0.00 87.18 12.82
12 Southampton bikes 23 0.00 0.00 100.00
13 Aberdeen scooters 40 0.00 100.00 0.00
14 Bath cars 142 57.03 31.73 11.24
15 Bath scooters 79 57.03 31.73 11.24
16 Bath bikes 28 57.03 31.73 11.24
17 Preston scooters 37 0.00 62.71 37.29
18 Preston bikes 22 0.00 62.71 37.29
19 Liverpool bikes 19 0.00 0.00 100.00
20 Derby cars 83 100.00 0.00 0.00
21 Hereford bikes 23 0.00 0.00 100.00
产生上述数据框的代码如下:
df <- tbl_df(df) %>%
mutate(., pct.cars = rep(as.numeric(0), length.out = length(df$city)),
pct.scooters = rep(as.numeric(0), length.out = length(df$city)),
pct.bikes = rep(as.numeric(0), length.out = length(df$city)))
for (i in 1:nrow(df)) {
cur_city <- df$city[i]
n_cars <- df$number[df$city == cur_city & df$transport == "cars"]
n_scooters <- df$number[df$city == cur_city & df$transport == "scooters"]
n_bikes <- df$number[df$city == cur_city & df$transport == "bikes"]
if (length(n_cars) == 1 & length(n_scooters) < 1 & length(n_bikes) < 1) {
# case: there are no scooters nor bikes
df$pct.cars[i] <- 100
df$pct.scooters[i] <- 0
df$pct.bikes[i] <- 0
} else if (length(n_cars) < 1 & length(n_scooters) == 1 & length(n_bikes) == 1) {
# case: there are no cars
df$pct.cars[i] <- 0
df$pct.scooters[i] <- (n_scooters/(n_scooters + n_bikes))*100
df$pct.bikes[i] <- (n_bikes/(n_scooters + n_bikes))*100
} else if (length(n_cars) == 1 & length(n_scooters) == 1 & length(n_bikes) < 1) {
# case: there are no bikes
df$pct.cars[i] <- (n_cars/(n_cars + n_scooters))*100
df$pct.scooters[i] <- (n_scooters/(n_cars + n_scooters))*100
df$pct.bikes[i] <- 0
} else if (length(n_cars) == 1 & length(n_scooters) < 1 & length(n_bikes) == 1) {
# case: there are no scooters
df$pct.cars[i] <- (n_cars/(n_cars + n_bikes))*100
df$pct.scooters[i] <- 0
df$pct.bikes[i] <- (n_bikes/(n_cars + n_bikes))*100
} else if (length(n_cars) < 1 & length(n_scooters) == 1 & length(n_bikes) < 1) {
# case: there are no cars nor bikes
df$pct.cars[i] <- 0
df$pct.scooters[i] <- 100
df$pct.bikes[i] <- 0
} else if (length(n_cars) < 1 & length(n_scooters) < 1 & length(n_bikes) == 1) {
# case: there are no cars nor scooters
df$pct.cars[i] <- 0
df$pct.scooters[i] <- 0
df$pct.bikes[i] <- 100
} else if (length(n_cars) == 1 & length(n_scooters) == 1 & length(n_bikes) == 1 ) {
# case: there are cars, scooters & bikes
df$pct.cars[i] <- (n_cars/(n_cars + n_scooters + n_bikes))*100
df$pct.scooters[i] <- (n_scooters/(n_cars + n_scooters + n_bikes))*100
df$pct.bikes[i] <- (n_bikes/(n_cars + n_scooters + n_bikes))*100
}
}
如果某人有一个更容易的解决方案或建议(可能使用dplyr),我们将不胜感激。提前谢谢!
答案 0 :(得分:1)
这样的事情怎么样?
df %>%
group_by(city) %>%
mutate(
number = as.numeric(as.character(number)),
ntot = sum(number),
percentage = number / ntot * 100,
id = paste0("perc.", transport)) %>%
spread(id, percentage) %>%
replace(., is.na(.), 0) %>%
select(-ntot) %>%
ungroup()
# city transport number perc.bikes perc.cars perc.scooters
# <fct> <fct> <dbl> <dbl> <dbl> <dbl>
# 1 Aberdeen scooters 40.0 0 0 100
# 2 Bath bikes 28.0 11.2 0 0
# 3 Bath cars 142 0 57.0 0
# 4 Bath scooters 79.0 0 0 31.7
# 5 Belfast bikes 8.00 3.86 0 0
# 6 Belfast cars 199 0 96.1 0
# 7 Derby cars 83.0 0 100 0
# 8 Glasgow scooters 17.0 0 0 100
# 9 Hereford bikes 23.0 100 0 0
#10 Leeds bikes 13.0 3.54 0 0
或者,如果您要填充所有百分比列中的条目:
df %>%
group_by(city) %>%
mutate(
number = as.numeric(as.character(number)),
ntot = sum(number),
percentage = number / ntot * 100,
id = paste0("perc.", transport)) %>%
spread(id, percentage) %>%
select(-ntot) %>%
fill(perc.bikes, perc.cars, perc.scooters, .direction = "up") %>%
fill(perc.bikes, perc.cars, perc.scooters, .direction = "down") %>%
replace(., is.na(.), 0) %>%
ungroup()
## A tibble: 21 x 6
# city transport number perc.bikes perc.cars perc.scooters
# <fct> <fct> <dbl> <dbl> <dbl> <dbl>
# 1 Aberdeen scooters 40.0 0 0 100
# 2 Bath bikes 28.0 11.2 57.0 31.7
# 3 Bath cars 142 11.2 57.0 31.7
# 4 Bath scooters 79.0 11.2 57.0 31.7
# 5 Belfast bikes 8.00 3.86 96.1 0
# 6 Belfast cars 199 3.86 96.1 0
# 7 Derby cars 83.0 0 100 0
# 8 Glasgow scooters 17.0 0 0 100
# 9 Hereford bikes 23.0 100 0 0
#10 Leeds bikes 13.0 3.54 72.8 23.7
## ... with 11 more rows
如果您只想要一个每个城市有百分比的表格,您可以这样做:
df %>%
group_by(city) %>%
mutate(
number = as.numeric(as.character(number)),
number = number / sum(ntot) * 100,
transport = paste0("perc.", transport)) %>%
spread(transport, number, fill = 0)
ungroup()
## A tibble: 13 x 4
# city perc.bikes perc.cars perc.scooters
# <fct> <dbl> <dbl> <dbl>
# 1 Aberdeen 0 0 100
# 2 Bath 11.2 57.0 31.7
# 3 Belfast 3.86 96.1 0
# 4 Derby 0 100 0
# 5 Glasgow 0 0 100
# 6 Hereford 100 0 0
# 7 Leeds 3.54 72.8 23.7
# 8 Liverpool 100 0 0
# 9 London 0 87.9 12.1
#10 Nottingham 0 100 0
#11 Oxford 12.8 0 87.2
#12 Preston 37.3 0 62.7
#13 Southampton 100 0 0
答案 1 :(得分:1)
有时表格/矩阵操作可以提供一个简单的解决方案:
df$number <- as.numeric(as.character(df$number))
prop.table(xtabs(number ~ city + transport, data=df), 1)
# transport
#city bikes cars scooters
# Aberdeen 0.00000000 0.00000000 1.00000000
# Bath 0.11244980 0.57028112 0.31726908
# Belfast 0.03864734 0.96135266 0.00000000
# Derby 0.00000000 1.00000000 0.00000000
# Glasgow 0.00000000 0.00000000 1.00000000
# Hereford 1.00000000 0.00000000 0.00000000
# Leeds 0.03542234 0.72752044 0.23705722
# Liverpool 1.00000000 0.00000000 0.00000000
# London 0.00000000 0.87931034 0.12068966
# Nottingham 0.00000000 1.00000000 0.00000000
# Oxford 0.12820513 0.00000000 0.87179487
# Preston 0.37288136 0.00000000 0.62711864
# Southampton 1.00000000 0.00000000 0.00000000
通过更改一个值,您还可以灵活地找到每个城市中某种交通工具的比例:
prop.table(xtabs(number ~ city + transport, data=df), 2)