我有一些公交车票务数据,我试图在任何公交车站找到公交车内的人数。 Here is the csv file.
kasa1 = read.csv("kasa1.csv")
mydata = kasa1[2:12]
max_stop = max(mydata[4], mydata[5])
min_stop = min(mydata[4], mydata[5])
stop_vector = seq(min_stop,max_stop,1)
occ_data = cumsum(sapply(stop_vector, function(x) (sum(mydata$passengers[mydata$from_stop_seq_no == x]) - sum(mydata$passengers[mydata$till_stop_seq_no == x]))))
这给出了任何公共汽车站内公交车内的总人数。
occ_data
[1] 585 585 600 546 584 583 584 571 549 547 560 558 560 536 535 550 458 457 433 417 350
[22] 224 224 225 211 232 152 152 165 151 147 147 147 134 147 109 109 107 101 86 88 62
[43] 62 62 62 62 0
但是,我试图为每个日期生成相同的结果。
这就是我希望我的输出是数据帧的方式,或者将行作为日期和列的矩阵作为总线停止ID:
例如:像这样的东西
1 2 3 ..... 47
2016-04-03 3 5 2 ..... 0
2016-04-30 1 2 0 ..... 0
2016-06-26 2 7 5 ..... 0
我正在尝试使用嵌套(s / l)apply来生成这种数据帧,但没有运气。
这是我的尝试:
occ_data = lapply(unique_dates, function(y) (lapply(stop_vector, function(x)
(sum(mydata$passengers[(mydata$from_stop_seq_no == x) & (mydata$ticket_date == y)]) -
sum(mydata$passengers[(mydata$till_stop_seq_no == x) & (mydata$ticket_date == y)])))))
编辑: 如果您无法访问csv文件
dput(kasa1)
structure(list(X = c(7783L, 17367L, 26616L, 28282L, 40071L, 50084L,
65608L, 66696L, 69662L, 74709L, 80904L, 82697L, 84394L, 93137L,
96306L, 102215L, 113483L, 130248L, 137158L, 137817L, 140604L,
142983L, 144318L, 145274L, 152069L, 156718L, 156943L, 184117L,
204043L, 207531L, 213158L, 231711L, 233410L, 239997L, 246074L,
259077L, 271726L, 272231L, 278522L, 284472L, 293683L, 296160L,
296757L, 299797L, 307623L, 311950L, 313409L, 316974L, 319620L,
322003L, 322504L, 330470L, 338110L, 340164L, 341619L, 349942L,
355707L, 363601L, 373094L, 386332L, 386380L, 393546L, 399955L,
402978L, 409051L, 413489L, 433396L, 433560L, 440140L, 441809L,
449981L, 452158L, 456117L, 457580L, 461605L, 461801L, 473184L,
482865L, 484450L, 491844L, 493962L, 495029L, 499000L, 525130L,
526204L, 526484L, 527785L, 528874L, 533681L, 533915L, 561859L,
571615L, 580474L, 585402L, 588904L, 601186L, 601270L, 611508L,
620406L, 626943L, 665902L, 674584L, 697482L, 703223L, 703686L,
704883L, 719594L, 725211L, 732220L, 732474L, 733275L, 743833L,
744555L, 748028L, 750119L, 768430L), schedule_no = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "V-335E/13", class = "factor"), trip_no = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), route_no = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V-335EUP", class = "factor"),
from_stop_seq_no = c(21L, 21L, 21L, 1L, 21L, 11L, 17L, 17L,
1L, 1L, 16L, 1L, 1L, 26L, 1L, 1L, 1L, 9L, 1L, 1L, 35L, 30L,
1L, 1L, 1L, 1L, 5L, 1L, 1L, 41L, 1L, 29L, 1L, 1L, 25L, 1L,
16L, 1L, 1L, 1L, 14L, 1L, 1L, 13L, 1L, 1L, 1L, 1L, 41L, 1L,
1L, 29L, 1L, 1L, 3L, 3L, 1L, 1L, 24L, 17L, 9L, 9L, 1L, 1L,
1L, 11L, 1L, 38L, 1L, 1L, 1L, 1L, 5L, 1L, 3L, 1L, 1L, 21L,
35L, 21L, 1L, 1L, 1L, 1L, 7L, 21L, 1L, 8L, 16L, 1L, 13L,
17L, 1L, 21L, 1L, 5L, 21L, 16L, 21L, 5L, 1L, 3L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 19L, 14L, 1L, 1L, 1L, 1L, 5L), till_stop_seq_no = c(47L,
36L, 27L, 22L, 36L, 38L, 26L, 31L, 47L, 40L, 22L, 6L, 25L,
36L, 26L, 22L, 18L, 22L, 22L, 17L, 47L, 38L, 40L, 8L, 34L,
47L, 17L, 20L, 21L, 47L, 14L, 47L, 22L, 22L, 40L, 9L, 21L,
19L, 8L, 12L, 34L, 15L, 27L, 39L, 14L, 30L, 22L, 17L, 47L,
27L, 10L, 36L, 4L, 25L, 47L, 42L, 22L, 22L, 27L, 31L, 36L,
17L, 21L, 21L, 27L, 40L, 39L, 47L, 21L, 47L, 25L, 20L, 20L,
19L, 27L, 17L, 19L, 31L, 42L, 27L, 8L, 22L, 17L, 14L, 30L,
47L, 22L, 12L, 30L, 25L, 39L, 22L, 20L, 27L, 30L, 22L, 39L,
42L, 27L, 10L, 9L, 14L, 39L, 9L, 47L, 42L, 22L, 38L, 21L,
31L, 20L, 39L, 22L, 47L, 17L, 17L), from_bus_stop_code_english = structure(c(12L,
12L, 12L, 8L, 12L, 5L, 12L, 12L, 8L, 8L, 5L, 8L, 8L, 1L,
8L, 8L, 8L, 2L, 8L, 8L, 15L, 15L, 8L, 8L, 8L, 8L, 13L, 8L,
8L, 6L, 8L, 3L, 8L, 8L, 3L, 8L, 5L, 8L, 8L, 8L, 4L, 8L, 8L,
11L, 8L, 8L, 8L, 8L, 6L, 8L, 8L, 3L, 8L, 8L, 14L, 14L, 8L,
8L, 9L, 12L, 2L, 2L, 8L, 8L, 8L, 5L, 8L, 7L, 8L, 8L, 8L,
8L, 2L, 8L, 14L, 8L, 8L, 1L, 15L, 12L, 8L, 8L, 8L, 8L, 10L,
12L, 8L, 11L, 5L, 8L, 11L, 12L, 8L, 12L, 8L, 13L, 12L, 5L,
12L, 13L, 8L, 14L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 4L, 8L,
8L, 8L, 8L, 13L), .Label = c("AECSL-1", "DMLUR", "GRPI",
"HALKM", "HALMG", "HP F", "ITL", "KBS1", "KUDGT", "MAO",
"MPH", "MTHB-1", "MYHL", "RICHMD", "SJRHS"), class = "factor"),
till_bus_stop_code_english = structure(c(8L, 17L, 1L, 1L,
17L, 6L, 3L, 17L, 8L, 8L, 14L, 15L, 10L, 17L, 3L, 1L, 14L,
14L, 14L, 4L, 8L, 6L, 9L, 11L, 7L, 8L, 4L, 10L, 13L, 8L,
12L, 8L, 14L, 14L, 8L, 12L, 13L, 5L, 11L, 4L, 7L, 5L, 1L,
7L, 12L, 3L, 1L, 4L, 8L, 1L, 2L, 17L, 16L, 10L, 8L, 6L, 1L,
1L, 1L, 17L, 17L, 4L, 13L, 13L, 1L, 9L, 7L, 8L, 13L, 8L,
10L, 10L, 10L, 5L, 1L, 4L, 5L, 17L, 6L, 1L, 11L, 14L, 4L,
12L, 3L, 8L, 1L, 4L, 3L, 10L, 7L, 1L, 10L, 1L, 3L, 14L, 7L,
6L, 1L, 2L, 12L, 12L, 7L, 12L, 8L, 6L, 14L, 6L, 13L, 17L,
10L, 7L, 14L, 8L, 4L, 4L), .Label = c("AECSL-1", "DMLUR",
"GRPI", "HALMG", "HALPS1", "HP F", "ITL", "KDG0", "KDGDBS",
"KUDGT", "MAO", "MPH", "MRH", "MTHB-1", "MYHL", "RICHMD",
"SJRHS"), class = "factor"), tickets = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 23L, 1L, 12L, 1L, 1L,
1L, 12L, 1L, 1L, 1L, 1L, 1L, 23L, 1L, 1L, 1L, 1L, 23L, 1L,
1L, 1L, 1L, 23L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L,
1L, 12L, 1L, 12L, 1L, 1L, 44L, 1L, 1L, 1L, 34L, 1L, 1L, 1L,
1L, 12L, 12L, 1L, 34L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 1L,
1L, 1L, 12L, 1L, 12L, 1L, 1L, 1L, 12L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 1L, 1L, 1L, 12L, 1L,
1L, 12L, 1L, 1L, 1L, 1L, 23L, 1L, 1L, 1L, 1L, 1L, 1L, 12L
), passengers = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
12L, 1L, 1L, 23L, 1L, 12L, 1L, 1L, 12L, 23L, 1L, 1L, 1L,
1L, 1L, 50L, 1L, 1L, 1L, 1L, 23L, 1L, 1L, 12L, 1L, 23L, 1L,
1L, 1L, 1L, 12L, 1L, 12L, 1L, 12L, 1L, 1L, 12L, 1L, 12L,
1L, 12L, 54L, 1L, 1L, 12L, 34L, 1L, 1L, 1L, 1L, 23L, 12L,
12L, 50L, 12L, 1L, 1L, 1L, 1L, 12L, 12L, 1L, 12L, 1L, 12L,
12L, 1L, 12L, 1L, 12L, 1L, 12L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 12L, 23L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 12L,
1L, 1L, 1L, 1L, 50L, 1L, 1L, 1L, 12L, 1L, 1L, 12L), total_ticket_amount = c(416L,
347L, 158L, 496L, 347L, 465L, 280L, 347L, 14L, 14L, 465L,
347L, 496L, 416L, 496L, 81L, 496L, 376L, 113L, 170L, 280L,
158L, 14L, 376L, 541L, 433L, 347L, 496L, 465L, 97L, 133L,
347L, 496L, 113L, 347L, 133L, 158L, 465L, 376L, 465L, 43L,
465L, 113L, 465L, 14L, 541L, 496L, 81L, 97L, 113L, 416L,
249L, 233L, 496L, 541L, 146L, 259L, 496L, 97L, 347L, 465L,
416L, 81L, 81L, 363L, 81L, 541L, 158L, 465L, 14L, 113L, 113L,
416L, 81L, 465L, 81L, 81L, 158L, 316L, 158L, 2L, 496L, 81L,
416L, 465L, 416L, 496L, 97L, 416L, 496L, 465L, 158L, 496L,
158L, 146L, 133L, 376L, 465L, 158L, 158L, 43L, 347L, 541L,
43L, 14L, 14L, 496L, 14L, 306L, 280L, 280L, 541L, 113L, 14L,
465L, 528L), ticket_date = structure(c(9L, 12L, 22L, 2L,
23L, 3L, 6L, 3L, 8L, 7L, 26L, 22L, 20L, 19L, 2L, 5L, 6L,
22L, 12L, 20L, 10L, 3L, 5L, 9L, 5L, 16L, 18L, 1L, 16L, 9L,
24L, 22L, 26L, 22L, 6L, 2L, 21L, 22L, 22L, 3L, 3L, 6L, 8L,
19L, 26L, 15L, 7L, 26L, 15L, 9L, 22L, 19L, 25L, 16L, 10L,
26L, 6L, 1L, 19L, 6L, 14L, 17L, 9L, 20L, 20L, 3L, 13L, 8L,
22L, 20L, 8L, 2L, 7L, 24L, 26L, 9L, 9L, 6L, 19L, 20L, 24L,
11L, 24L, 9L, 16L, 10L, 4L, 6L, 19L, 26L, 13L, 6L, 7L, 9L,
22L, 17L, 20L, 17L, 18L, 17L, 6L, 25L, 22L, 5L, 15L, 15L,
24L, 6L, 24L, 1L, 5L, 24L, 20L, 9L, 22L, 17L), .Label = c("2015-11-28",
"2015-12-12", "2016-01-09", "2016-01-10", "2016-01-17", "2016-02-13",
"2016-02-28", "2016-04-02", "2016-04-03", "2016-04-16", "2016-04-17",
"2016-04-30", "2016-05-08", "2016-05-14", "2016-05-29", "2016-06-04",
"2016-06-05", "2016-06-12", "2016-06-18", "2016-06-19", "2016-06-25",
"2016-06-26", "2016-07-02", "2016-07-03", "2016-07-09", "2016-07-17"
), class = "factor")), .Names = c("X", "schedule_no", "trip_no",
"route_no", "from_stop_seq_no", "till_stop_seq_no", "from_bus_stop_code_english",
"till_bus_stop_code_english", "tickets", "passengers", "total_ticket_amount",
"ticket_date"), class = "data.frame", row.names = c(NA, -116L
))
答案 0 :(得分:1)
这是一种方式。首先用。创建一个中间数据框架 每站上下车的乘客数量
stops <- unique(unlist(kasa1[c('from_stop_seq_no', 'till_stop_seq_no')]))
tmp <- lapply(
split(kasa1, kasa1$ticket_date),
function (x) {
lapply(stops, function (i) {
data.frame(
stop = i,
date = x$ticket_date[1],
passengers.in = sum(x[x$from_stop_seq_no == i, 'passengers']),
passengers.out = sum(x[x$till_stop_seq_no == i, 'passengers']))
})
})
kasa2 <- do.call(rbind, unlist(unname(tmp), recursive = FALSE))
head(kasa2)
# stop date passengers.in passengers.out
#1 21 2015-11-28 0 0
#2 1 2015-11-28 2 0
#3 11 2015-11-28 0 0
#4 17 2015-11-28 0 0
#5 16 2015-11-28 0 0
#6 26 2015-11-28 0 0
现在我们可以计算每站巴士上的乘客数量
sapply(split(kasa2, kasa2$date),
function (x) {
with(x[order(x$stop), ],
cumsum(passengers.in) - cumsum(passengers.out))
})
答案 1 :(得分:1)
这是另一种使用data.table
的方法,可以为您提供所需的结果。我只保留了您感兴趣的三个变量:stop_seq_no
,occ_data
和ticket_date
但您可以将此方法应用于可能需要的任何变量。我更喜欢使用data.table
,因为它对于大型数据集要快得多。您也可以考虑使用某些用户发现的as.IDate
更有效率。
library(data.table)
dat <- fread("kasa1.csv")
dat$ticket_date <- as.Date(dat$ticket_date, format = "%Y-%m-%d")
# function to give occ_data for each stop for a given date
func1 <- function(z, dat){
mydata <- dat[ticket_date == z]
newdat <- data.table(ticket_date = z,
stop_seq_no = unique(c(mydata$from_stop_seq_no, mydata$till_stop_seq_no)),
occ_data = cumsum(sapply(unique(c(mydata$from_stop_seq_no, mydata$till_stop_seq_no)),
function(x) {
sum(mydata$passengers[mydata$from_stop_seq_no == x]) -
sum(mydata$passengers[mydata$till_stop_seq_no == x])
} )) )
return(newdat)
}
list_data <- lapply(unique(dat$ticket_date), function(z) func1(z, dat) ) #apply function by date for all ticket_date in the dataset
merged_data <- rbindlist(list_data, use.names = T, fill = T) # merge results into a single data.table
# order data by ticket_date and stop_seq_no
merged_data[order(ticket_date, stop_seq_no)]
编辑: 这是输出:
merged_data[order(ticket_date, stop_seq_no)]
ticket_date stop_seq_no occ_data
1: 2015-11-28 1 2
2: 2015-11-28 19 3
3: 2015-11-28 20 2
4: 2015-11-28 22 1
5: 2015-11-28 31 0
---
153: 2016-07-17 17 14
154: 2016-07-17 22 38
155: 2016-07-17 25 0
156: 2016-07-17 27 1
157: 2016-07-17 42 2
希望这有用!