我有一个相当大的数据集(15.000行),我需要根据数据结构对每一行进行计算。我的数据集中有一列需要进一步拆分。以下是一个例子:
# Sample data: three records. Each Route value is a ";"-separated list of
# segments, and every segment is a ":"-separated list of eight fields.
date <- c("2015-07-10", "2013-05-06", "2017-08-10")
Number <- c(345, 231, 10)
Route <- c(
  "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
  "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
  "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
)
Dep <- c("FGC", "HAM", "ICAO")

# Column names are taken from the variable names above.
Plan <- data.frame(date, Number, Route, Dep)
对我来说,重要的信息在“Route”列中。我需要从此列生成聚合(aggregated)特征。该列每个单元格中的信息需要先按“;”进行拆分。
到目前为止我尝试了什么:
选择一行
只用这一行创建一个新的数据框。
在“Route”列上使用mutate和unnest,按“;”进行拆分,并为每个片段创建一个新行:
test <- Plan[1, ]
test <- test %>% mutate(Route = strsplit(as.character(Route), ";")) %>% unnest(Route)
使用cSplit按“:”将“Route”列中的信息拆分为多列
# Split the Route column on ":" into separate columns; use `<-` for
# assignment, as is idiomatic in R.
test <- cSplit(test, "Route", ":")
然后我对这个数据子集进行计算。
我创建变量x,y,z来保存我的计算
# Aggregates over the segments of the selected row: mean of field 2,
# maximum of field 5 and minimum of field 8.
x1 <- mean(test[["Route_2"]])
y1 <- max(test[["Route_5"]])
z1 <- min(test[["Route_8"]])
两个问题:
如何为原始数据集中的所有行自动执行此操作? 如何将保存的变量(x,y,z)中的数据合并回原始数据框?
期望的输出(x2和x3等并不是实际计算出的值,只是一个例子)
# Placeholder results for the three rows (illustrative values only).
x1 <- 12
y1 <- 86363
z1 <- 7383

x2 <- 45
y2 <- 6754
z2 <- 3553

x3 <- 5648
y3 <- 64
z3 <- 6363

# Attach one value per row as new columns of the original data frame.
Plan[["x"]] <- c(x1, x2, x3)
Plan[["y"]] <- c(y1, y2, y3)
Plan[["z"]] <- c(z1, z2, z3)
head(Plan)
全部示例代码
library(splitstackshape)
library(plyr)
library(tidyr)
# Sample data: three records. Each Route value is a ";"-separated list of
# segments, and every segment is a ":"-separated list of eight fields.
date <- c("2015-07-10", "2013-05-06", "2017-08-10")
Number <- c(345, 231, 10)
Route <- c(
  "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
  "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
  "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
)
Dep <- c("FGC", "HAM", "ICAO")
Plan <- data.frame(date, Number, Route, Dep)

# Work on the first row only: expand it to one row per ";"-separated segment.
test <- Plan[1, ]
test <- test %>%
  mutate(Route = strsplit(as.character(Route), ";")) %>%
  unnest(Route)

# Split every segment on ":" into separate columns (referenced below as
# Route_2, Route_5 and Route_8). Assignment uses `<-` like the rest of the
# script.
test <- cSplit(test, "Route", ":")

# Aggregates for row 1, computed from the split columns.
x1 <- mean(test$Route_2)
y1 <- max(test$Route_5)
z1 <- min(test$Route_8)

# Rows 2 and 3 are hard-coded placeholders, not computed values.
x2 <- 45
y2 <- 6754
z2 <- 3553
x3 <- 5648
y3 <- 64
z3 <- 6363

# Attach one value per row as new columns of the original data frame.
Plan$x <- c(x1, x2, x3)
Plan$y <- c(y1, y2, y3)
Plan$z <- c(z1, z2, z3)
head(Plan)
答案 0 :(得分:2)
以下是我使用tidyverse系列包的做法:
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
# Summarise one already-split Route entry.
#
# `route` is a character vector of ":"-delimited segments. Fields 2, 5 and 8
# of every segment are converted to numbers and averaged across segments.
# Returns a one-row tibble with columns x, y and z holding those three means.
route_extract <- function(route) {
  fields <- str_split(route, fixed(":"), simplify = TRUE)
  # drop = FALSE keeps a matrix even when there is only a single segment.
  wanted <- fields[, c(2, 5, 8), drop = FALSE]
  # Switch the matrix from character to numeric while keeping its dimensions.
  storage.mode(wanted) <- "numeric"
  avg <- colMeans(wanted)
  tibble(x = avg[1], y = avg[2], z = avg[3])
}
# Split every element of `routes` on ";" and summarise each one with
# route_extract(), row-binding the per-element results into one data frame.
route_calc <- function(routes) {
  segments <- str_split(routes, fixed(";"))
  map_df(segments, route_extract)
}
# Compute the per-row summary columns and append them to Plan.
route_summary <- route_calc(Plan[["Route"]])
Plan <- bind_cols(Plan, route_summary)
套餐的方式:
TypeError at /realestateprogram/edition_appartement/19/
__init__() got an unexpected keyword argument 'user'
答案 1 :(得分:2)
创建第二个临时Route列,命名为Route_tmp,把其中以分号分隔的每个组成部分展开为单独的一行,然后把得到的Route_tmp变量按冒号拆分成单独的列。接着按原始变量分组,对所需的列取平均值。(请注意,如果输出中不需要保留Route,可以省略开头的mutate,并直接使用Route代替Route_tmp。)
library(dplyr)
library(tidyr)
# Expand each Route into one row per ";"-separated segment, split every
# segment on ":" into eight fields, then average fields 2, 5 and 8 for each
# original row.
out <- Plan %>%
  # Work on a copy so the untouched Route string can serve as a grouping key.
  mutate(Route_tmp = Route) %>%
  separate_rows(Route_tmp, sep = ";") %>%
  # sep must be given explicitly: separate()'s default pattern
  # "[^[:alnum:]]+" treats ":-" as a single delimiter and silently drops the
  # minus sign of negative fields (for the sample data y would come out as 17
  # instead of the correct -17).
  separate(Route_tmp, as.character(1:8), sep = ":", convert = TRUE) %>%
  group_by(date, Number, Route, Dep) %>%
  summarize(x = mean(`2`), y = mean(`5`), z = mean(`8`)) %>%
  ungroup()
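给出以下内容(为便于阅读,这里不显示Route列)。注意:下面的输出是在separate()使用默认分隔符的情况下得到的;默认分隔符会把“:-”整体当作一个分隔符,从而丢掉负号,所以y显示为17,而第5个字段(-4、-2、-45)的真实平均值是-17: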
> out[-3]
# A tibble: 3 × 6
date Number Dep x y z
<fctr> <dbl> <fctr> <dbl> <dbl> <dbl>
1 2013-05-06 231 HAM 8224.333 17 33.66667
2 2015-07-10 345 FGC 8224.333 17 33.66667
3 2017-08-10 10 ICAO 8224.333 17 33.66667
注意:由于Plan在问题中被覆盖,我不清楚究竟哪个版本的Plan才是输入,因此这里假设输入如下:
# Assumed input: the data frame as it is before any x/y/z columns are added.
Plan <- data.frame(
  date = c("2015-07-10", "2013-05-06", "2017-08-10"),
  Number = c(345, 231, 10),
  Route = c(
    "GCLP:10011:-8848:56:-4:270:260:12;LPC:1211:-828:56:-2:22:220:22;GCCC:13451:-85458:556:-45:45:76:67",
    "DPAP:10011:-8848:56:-4:270:260:12;LTTC:1211:-828:56:-2:22:220:22;ATCH:13451:-85458:556:-45:45:76:67",
    "AMN:10011:-8848:56:-4:270:260:12;RET:1211:-828:56:-2:22:220:22;LLOP:13451:-85458:556:-45:45:76:67"
  ),
  Dep = c("FGC", "HAM", "ICAO")
)