嵌套适用于R

时间:2017-11-06 13:59:23

标签: r dataframe

我有一些公交车票务数据,我试图在任何公交车站找到公交车内的人数。 Here is the csv file.

kasa1 = read.csv("kasa1.csv")

mydata = kasa1[2:12]
max_stop = max(mydata[4], mydata[5])
min_stop = min(mydata[4], mydata[5])
stop_vector = seq(min_stop,max_stop,1)


occ_data = cumsum(sapply(stop_vector, function(x) (sum(mydata$passengers[mydata$from_stop_seq_no == x]) - sum(mydata$passengers[mydata$till_stop_seq_no == x]))))

这给出了任何公共汽车站内公交车内的总人数。

occ_data
 [1] 585 585 600 546 584 583 584 571 549 547 560 558 560 536 535 550 458 457 433 417 350
[22] 224 224 225 211 232 152 152 165 151 147 147 147 134 147 109 109 107 101  86  88  62
[43]  62  62  62  62   0

但是,我试图为每个日期生成相同的结果。

这就是我希望我的输出是数据帧的方式,或者将行作为日期和列的矩阵作为总线停止ID:

例如:像这样的东西

           1    2     3 .....           47
2016-04-03 3    5     2 .....           0

2016-04-30 1   2      0 .....           0

2016-06-26 2   7      5 .....           0

我正在尝试使用嵌套(s / l)apply来生成这种数据帧,但没有运气。

这是我的尝试:

occ_data = lapply(unique_dates, function(y) (lapply(stop_vector, function(x) 
                    (sum(mydata$passengers[(mydata$from_stop_seq_no == x) & (mydata$ticket_date == y)]) - 
                       sum(mydata$passengers[(mydata$till_stop_seq_no == x) & (mydata$ticket_date == y)])))))

编辑: 如果您无法访问csv文件

dput(kasa1)
structure(list(X = c(7783L, 17367L, 26616L, 28282L, 40071L, 50084L, 
65608L, 66696L, 69662L, 74709L, 80904L, 82697L, 84394L, 93137L, 
96306L, 102215L, 113483L, 130248L, 137158L, 137817L, 140604L, 
142983L, 144318L, 145274L, 152069L, 156718L, 156943L, 184117L, 
204043L, 207531L, 213158L, 231711L, 233410L, 239997L, 246074L, 
259077L, 271726L, 272231L, 278522L, 284472L, 293683L, 296160L, 
296757L, 299797L, 307623L, 311950L, 313409L, 316974L, 319620L, 
322003L, 322504L, 330470L, 338110L, 340164L, 341619L, 349942L, 
355707L, 363601L, 373094L, 386332L, 386380L, 393546L, 399955L, 
402978L, 409051L, 413489L, 433396L, 433560L, 440140L, 441809L, 
449981L, 452158L, 456117L, 457580L, 461605L, 461801L, 473184L, 
482865L, 484450L, 491844L, 493962L, 495029L, 499000L, 525130L, 
526204L, 526484L, 527785L, 528874L, 533681L, 533915L, 561859L, 
571615L, 580474L, 585402L, 588904L, 601186L, 601270L, 611508L, 
620406L, 626943L, 665902L, 674584L, 697482L, 703223L, 703686L, 
704883L, 719594L, 725211L, 732220L, 732474L, 733275L, 743833L, 
744555L, 748028L, 750119L, 768430L), schedule_no = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "V-335E/13", class = "factor"), trip_no = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), route_no = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "V-335EUP", class = "factor"), 
    from_stop_seq_no = c(21L, 21L, 21L, 1L, 21L, 11L, 17L, 17L, 
    1L, 1L, 16L, 1L, 1L, 26L, 1L, 1L, 1L, 9L, 1L, 1L, 35L, 30L, 
    1L, 1L, 1L, 1L, 5L, 1L, 1L, 41L, 1L, 29L, 1L, 1L, 25L, 1L, 
    16L, 1L, 1L, 1L, 14L, 1L, 1L, 13L, 1L, 1L, 1L, 1L, 41L, 1L, 
    1L, 29L, 1L, 1L, 3L, 3L, 1L, 1L, 24L, 17L, 9L, 9L, 1L, 1L, 
    1L, 11L, 1L, 38L, 1L, 1L, 1L, 1L, 5L, 1L, 3L, 1L, 1L, 21L, 
    35L, 21L, 1L, 1L, 1L, 1L, 7L, 21L, 1L, 8L, 16L, 1L, 13L, 
    17L, 1L, 21L, 1L, 5L, 21L, 16L, 21L, 5L, 1L, 3L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 19L, 14L, 1L, 1L, 1L, 1L, 5L), till_stop_seq_no = c(47L, 
    36L, 27L, 22L, 36L, 38L, 26L, 31L, 47L, 40L, 22L, 6L, 25L, 
    36L, 26L, 22L, 18L, 22L, 22L, 17L, 47L, 38L, 40L, 8L, 34L, 
    47L, 17L, 20L, 21L, 47L, 14L, 47L, 22L, 22L, 40L, 9L, 21L, 
    19L, 8L, 12L, 34L, 15L, 27L, 39L, 14L, 30L, 22L, 17L, 47L, 
    27L, 10L, 36L, 4L, 25L, 47L, 42L, 22L, 22L, 27L, 31L, 36L, 
    17L, 21L, 21L, 27L, 40L, 39L, 47L, 21L, 47L, 25L, 20L, 20L, 
    19L, 27L, 17L, 19L, 31L, 42L, 27L, 8L, 22L, 17L, 14L, 30L, 
    47L, 22L, 12L, 30L, 25L, 39L, 22L, 20L, 27L, 30L, 22L, 39L, 
    42L, 27L, 10L, 9L, 14L, 39L, 9L, 47L, 42L, 22L, 38L, 21L, 
    31L, 20L, 39L, 22L, 47L, 17L, 17L), from_bus_stop_code_english = structure(c(12L, 
    12L, 12L, 8L, 12L, 5L, 12L, 12L, 8L, 8L, 5L, 8L, 8L, 1L, 
    8L, 8L, 8L, 2L, 8L, 8L, 15L, 15L, 8L, 8L, 8L, 8L, 13L, 8L, 
    8L, 6L, 8L, 3L, 8L, 8L, 3L, 8L, 5L, 8L, 8L, 8L, 4L, 8L, 8L, 
    11L, 8L, 8L, 8L, 8L, 6L, 8L, 8L, 3L, 8L, 8L, 14L, 14L, 8L, 
    8L, 9L, 12L, 2L, 2L, 8L, 8L, 8L, 5L, 8L, 7L, 8L, 8L, 8L, 
    8L, 2L, 8L, 14L, 8L, 8L, 1L, 15L, 12L, 8L, 8L, 8L, 8L, 10L, 
    12L, 8L, 11L, 5L, 8L, 11L, 12L, 8L, 12L, 8L, 13L, 12L, 5L, 
    12L, 13L, 8L, 14L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 4L, 8L, 
    8L, 8L, 8L, 13L), .Label = c("AECSL-1", "DMLUR", "GRPI", 
    "HALKM", "HALMG", "HP F", "ITL", "KBS1", "KUDGT", "MAO", 
    "MPH", "MTHB-1", "MYHL", "RICHMD", "SJRHS"), class = "factor"), 
    till_bus_stop_code_english = structure(c(8L, 17L, 1L, 1L, 
    17L, 6L, 3L, 17L, 8L, 8L, 14L, 15L, 10L, 17L, 3L, 1L, 14L, 
    14L, 14L, 4L, 8L, 6L, 9L, 11L, 7L, 8L, 4L, 10L, 13L, 8L, 
    12L, 8L, 14L, 14L, 8L, 12L, 13L, 5L, 11L, 4L, 7L, 5L, 1L, 
    7L, 12L, 3L, 1L, 4L, 8L, 1L, 2L, 17L, 16L, 10L, 8L, 6L, 1L, 
    1L, 1L, 17L, 17L, 4L, 13L, 13L, 1L, 9L, 7L, 8L, 13L, 8L, 
    10L, 10L, 10L, 5L, 1L, 4L, 5L, 17L, 6L, 1L, 11L, 14L, 4L, 
    12L, 3L, 8L, 1L, 4L, 3L, 10L, 7L, 1L, 10L, 1L, 3L, 14L, 7L, 
    6L, 1L, 2L, 12L, 12L, 7L, 12L, 8L, 6L, 14L, 6L, 13L, 17L, 
    10L, 7L, 14L, 8L, 4L, 4L), .Label = c("AECSL-1", "DMLUR", 
    "GRPI", "HALMG", "HALPS1", "HP F", "ITL", "KDG0", "KDGDBS", 
    "KUDGT", "MAO", "MPH", "MRH", "MTHB-1", "MYHL", "RICHMD", 
    "SJRHS"), class = "factor"), tickets = c(1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 23L, 1L, 12L, 1L, 1L, 
    1L, 12L, 1L, 1L, 1L, 1L, 1L, 23L, 1L, 1L, 1L, 1L, 23L, 1L, 
    1L, 1L, 1L, 23L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 
    1L, 12L, 1L, 12L, 1L, 1L, 44L, 1L, 1L, 1L, 34L, 1L, 1L, 1L, 
    1L, 12L, 12L, 1L, 34L, 1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 
    1L, 1L, 12L, 1L, 12L, 1L, 1L, 1L, 12L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 1L, 1L, 1L, 12L, 1L, 
    1L, 12L, 1L, 1L, 1L, 1L, 23L, 1L, 1L, 1L, 1L, 1L, 1L, 12L
    ), passengers = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    12L, 1L, 1L, 23L, 1L, 12L, 1L, 1L, 12L, 23L, 1L, 1L, 1L, 
    1L, 1L, 50L, 1L, 1L, 1L, 1L, 23L, 1L, 1L, 12L, 1L, 23L, 1L, 
    1L, 1L, 1L, 12L, 1L, 12L, 1L, 12L, 1L, 1L, 12L, 1L, 12L, 
    1L, 12L, 54L, 1L, 1L, 12L, 34L, 1L, 1L, 1L, 1L, 23L, 12L, 
    12L, 50L, 12L, 1L, 1L, 1L, 1L, 12L, 12L, 1L, 12L, 1L, 12L, 
    12L, 1L, 12L, 1L, 12L, 1L, 12L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 12L, 23L, 1L, 1L, 1L, 1L, 12L, 1L, 1L, 12L, 
    1L, 1L, 1L, 1L, 50L, 1L, 1L, 1L, 12L, 1L, 1L, 12L), total_ticket_amount = c(416L, 
    347L, 158L, 496L, 347L, 465L, 280L, 347L, 14L, 14L, 465L, 
    347L, 496L, 416L, 496L, 81L, 496L, 376L, 113L, 170L, 280L, 
    158L, 14L, 376L, 541L, 433L, 347L, 496L, 465L, 97L, 133L, 
    347L, 496L, 113L, 347L, 133L, 158L, 465L, 376L, 465L, 43L, 
    465L, 113L, 465L, 14L, 541L, 496L, 81L, 97L, 113L, 416L, 
    249L, 233L, 496L, 541L, 146L, 259L, 496L, 97L, 347L, 465L, 
    416L, 81L, 81L, 363L, 81L, 541L, 158L, 465L, 14L, 113L, 113L, 
    416L, 81L, 465L, 81L, 81L, 158L, 316L, 158L, 2L, 496L, 81L, 
    416L, 465L, 416L, 496L, 97L, 416L, 496L, 465L, 158L, 496L, 
    158L, 146L, 133L, 376L, 465L, 158L, 158L, 43L, 347L, 541L, 
    43L, 14L, 14L, 496L, 14L, 306L, 280L, 280L, 541L, 113L, 14L, 
    465L, 528L), ticket_date = structure(c(9L, 12L, 22L, 2L, 
    23L, 3L, 6L, 3L, 8L, 7L, 26L, 22L, 20L, 19L, 2L, 5L, 6L, 
    22L, 12L, 20L, 10L, 3L, 5L, 9L, 5L, 16L, 18L, 1L, 16L, 9L, 
    24L, 22L, 26L, 22L, 6L, 2L, 21L, 22L, 22L, 3L, 3L, 6L, 8L, 
    19L, 26L, 15L, 7L, 26L, 15L, 9L, 22L, 19L, 25L, 16L, 10L, 
    26L, 6L, 1L, 19L, 6L, 14L, 17L, 9L, 20L, 20L, 3L, 13L, 8L, 
    22L, 20L, 8L, 2L, 7L, 24L, 26L, 9L, 9L, 6L, 19L, 20L, 24L, 
    11L, 24L, 9L, 16L, 10L, 4L, 6L, 19L, 26L, 13L, 6L, 7L, 9L, 
    22L, 17L, 20L, 17L, 18L, 17L, 6L, 25L, 22L, 5L, 15L, 15L, 
    24L, 6L, 24L, 1L, 5L, 24L, 20L, 9L, 22L, 17L), .Label = c("2015-11-28", 
    "2015-12-12", "2016-01-09", "2016-01-10", "2016-01-17", "2016-02-13", 
    "2016-02-28", "2016-04-02", "2016-04-03", "2016-04-16", "2016-04-17", 
    "2016-04-30", "2016-05-08", "2016-05-14", "2016-05-29", "2016-06-04", 
    "2016-06-05", "2016-06-12", "2016-06-18", "2016-06-19", "2016-06-25", 
    "2016-06-26", "2016-07-02", "2016-07-03", "2016-07-09", "2016-07-17"
    ), class = "factor")), .Names = c("X", "schedule_no", "trip_no", 
"route_no", "from_stop_seq_no", "till_stop_seq_no", "from_bus_stop_code_english", 
"till_bus_stop_code_english", "tickets", "passengers", "total_ticket_amount", 
"ticket_date"), class = "data.frame", row.names = c(NA, -116L
))

2 个答案:

答案 0 :(得分:1)

这是一种方式。首先用。创建一个中间数据框架 每站上下车的乘客数量

stops <- unique(unlist(kasa1[c('from_stop_seq_no', 'till_stop_seq_no')]))

tmp <- lapply(
    split(kasa1, kasa1$ticket_date),
    function (x) {
        lapply(stops, function (i) {
            data.frame(
                stop = i,
                date = x$ticket_date[1],
                passengers.in = sum(x[x$from_stop_seq_no == i, 'passengers']),
                passengers.out = sum(x[x$till_stop_seq_no == i, 'passengers']))
        })
    })
kasa2 <- do.call(rbind, unlist(unname(tmp), recursive = FALSE))
head(kasa2)
#  stop       date passengers.in passengers.out
#1   21 2015-11-28             0              0
#2    1 2015-11-28             2              0
#3   11 2015-11-28             0              0
#4   17 2015-11-28             0              0
#5   16 2015-11-28             0              0
#6   26 2015-11-28             0              0

现在我们可以计算每站巴士上的乘客数量

sapply(split(kasa2, kasa2$date),
       function (x) {
           with(x[order(x$stop), ],
                cumsum(passengers.in) - cumsum(passengers.out))
       })

答案 1 :(得分:1)

这是另一种使用data.table的方法,可以为您提供所需的结果。我只保留了您感兴趣的三个变量:stop_seq_noocc_dataticket_date但您可以将此方法应用于可能需要的任何变量。我更喜欢使用data.table,因为它对于大型数据集要快得多。您也可以考虑使用某些用户发现的as.IDate更有效率。

library(data.table)
dat <- fread("kasa1.csv")
dat$ticket_date <- as.Date(dat$ticket_date, format = "%Y-%m-%d")

# function to give occ_data for each stop for a given date
func1 <- function(z, dat){
  mydata <- dat[ticket_date == z]
  newdat <- data.table(ticket_date = z, 
                       stop_seq_no = unique(c(mydata$from_stop_seq_no, mydata$till_stop_seq_no)),
                       occ_data = cumsum(sapply(unique(c(mydata$from_stop_seq_no, mydata$till_stop_seq_no)), 
                                                function(x) {
                                                  sum(mydata$passengers[mydata$from_stop_seq_no == x]) - 
                                                    sum(mydata$passengers[mydata$till_stop_seq_no == x])
                                                  } )) )
  return(newdat)
}


list_data <- lapply(unique(dat$ticket_date), function(z) func1(z, dat) ) #apply function by date for all ticket_date in the dataset
merged_data <- rbindlist(list_data, use.names = T, fill = T) # merge results into a single data.table 

# order data by ticket_date and stop_seq_no 
merged_data[order(ticket_date, stop_seq_no)]

编辑: 这是输出:

merged_data[order(ticket_date, stop_seq_no)]
     ticket_date stop_seq_no occ_data
  1:  2015-11-28           1        2
  2:  2015-11-28          19        3
  3:  2015-11-28          20        2
  4:  2015-11-28          22        1
  5:  2015-11-28          31        0
 ---                                 
153:  2016-07-17          17       14
154:  2016-07-17          22       38
155:  2016-07-17          25        0
156:  2016-07-17          27        1
157:  2016-07-17          42        2

希望这有用!