我试图编写一个循环遍历给定数据帧的R函数来过滤它。数据框中的数据包括伦敦地铁中两条线之间的旅行信息,我希望切断最高百分比。这是输入数据的str()函数的输出:
'data.frame': 71748 obs. of 9 variables:
$ depart : Factor w/ 52 levels "Bank","Barkingside",..: 22 22 22 22 22 25 25 25 25 25 ...
$ arrival : Factor w/ 48 levels "Bank","Barkingside",..: 48 43 38 5 8 1 42 48 41 43 ...
$ traveltime : num 433 1102 161 584 891 ...
$ departuretime: POSIXlt, format: "2014-03-24 18:17:20" "2014-03-24 18:17:20" "2014-03-24 18:17:20" ...
$ arrivaltime : POSIXlt, format: "2014-03-24 18:24:33" "2014-03-24 18:35:42" "2014-03-24 18:20:01" ...
$ lcid : Factor w/ 28 levels "1000001","1000002",..: 1 1 1 1 1 1 1 1 1 1 ...
$ tripno : Factor w/ 25 levels "1","10","11",..: 2 2 2 2 2 2 2 2 2 2 ...
$ destination : Factor w/ 18 levels "Debden","Ealing Broadway",..: 3 3 3 3 3 3 3 3 3 3 ...
$ line : Factor w/ 1 level "C": 1 1 1 1 1 1 1 1 1 1 ...
这是我写的函数:
#cut off top percent of travel times for each combination of arrival and
#departure stations to remove outliers
cutOffTopPercent <- function(data, percentage=0.99){
res <- data.frame()
#loop through all combinations of depart and arrival stations
for(i in 1:length(levels(data$depart))){
for(j in 1:length(levels(data$arrival))){
#create variables for departure/arrival station to make code easier to read
departureStation <- levels(data$depart)[i]
arrivalStation <- levels(data$arrival)[j]
#create a subset containing only the current departure and arrival station
dataSubset <- data[data$depart == departureStation & data$arrival == arrivalStation,]
#get top value that's allowed
upperBorder <- getTopPercentileBottom(dataSubset, percentage)
#remove records with values higher than than allowed
dataSubset <- dataSubset[dataSubset$traveltime < upperBorder,]
#glue the subset to the end result
res <- rbind(res,dataSubset)
}
}
return(res)
}
#returns the traveltime that marks where the given percentage of traveltimes starts
getTopPercentileBottom <- function(data, percentile){
upperBorder <- quantile(data$traveltime, probs = percentile)
return(upperBorder)
}
cutOffTopPercent()函数总是返回一个空数据帧。我无法找到错误。我一直试图手动转到这些步骤,但是当我这样做时,所有数据子集都会正确地附加到res数据帧。
任何人都可以看到我做错了什么,或者建议我更好地处理我想要做的事情吗?
编辑:
输入数据中前30条记录的输入:
structure(list(depart = structure(c(22L, 22L, 22L, 22L, 22L,
25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 25L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L), .Label = c("Bank",
"Barkingside", "Bethnal Green", "Bond Street", "Buckhurst Hill",
"Chancery Lane", "Chigwell", "Debden", "Ealing Broadway", "East Acton",
"Epping", "Fairlop", "Gants Hill", "Grange Hill", "Greenford",
"Hainault", "Hanger Lane", "Holborn", "Holland Park", "Lancaster Gate",
"Leyton", "Leytonstone", "Liverpool Street", "Loughton", "Marble Arch",
"Mile End", "Newbury Park", "Newbury Park Loop", "North Acton",
"North Acton Junction", "Northolt", "Notting Hill Gate", "Oxford Circus",
"Perivale", "Queensway", "Redbridge", "Roding Valley", "Ruislip Gardens",
"Shepherd's Bush", "Shepherds Bush (Central Line)", "Snaresbrook",
"South Ruislip", "South Woodford", "St. Paul's", "Stratford",
"Theydon Bois", "Tottenham Court Road", "Wanstead", "West Acton",
"West Ruislip", "White City", "Woodford"), class = "factor"),
arrival = structure(c(48L, 43L, 38L, 5L, 8L, 1L, 42L, 48L,
41L, 43L, 6L, 38L, 5L, 4L, 16L, 30L, 44L, 20L, 8L, 3L, 24L,
19L, 1L, 42L, 48L, 41L, 43L, 6L, 38L, 5L), .Label = c("Bank",
"Barkingside", "Bethnal Green", "Bond Street", "Buckhurst Hill",
"Chancery Lane", "Chigwell", "Debden", "East Acton", "Fairlop",
"Gants Hill", "Grange Hill", "Greenford", "Hainault", "Hanger Lane",
"Holborn", "Holland Park", "Lancaster Gate", "Leyton", "Leytonstone",
"Liverpool Street", "Loughton", "Marble Arch", "Mile End",
"Newbury Park", "North Acton", "North Acton Junction", "Northolt",
"Notting Hill Gate", "Oxford Circus", "Perivale", "Queensway",
"Redbridge", "Roding Valley", "Ruislip Gardens", "Shepherd's Bush",
"Shepherds Bush (Central Line)", "Snaresbrook", "South Ruislip",
"South Woodford", "St. Paul's", "Stratford", "Theydon Bois",
"Tottenham Court Road", "Wanstead", "West Acton", "White City",
"Woodford"), class = "factor"), traveltime = c(433, 1102,
161, 584, 891, 829, 1473, 2273, 629, 2942, 467, 2001, 2424,
75, 351, 165, 249, 1840, 2731, 1148, 1289, 1653, 580, 1224,
2024, 380, 2693, 218, 1752, 2175), departuretime = structure(list(
sec = c(20, 20, 20, 20, 20, 40, 40, 40, 40, 40, 40, 40,
40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 49, 49, 49, 49,
49, 49, 49, 49), min = c(17L, 17L, 17L, 17L, 17L, 46L,
46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L, 46L,
46L, 46L, 46L, 46L, 46L, 50L, 50L, 50L, 50L, 50L, 50L,
50L, 50L), hour = c(18L, 18L, 18L, 18L, 18L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L), mday = c(24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L
), mon = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), year = c(114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L), wday = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), yday = c(82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L
), isdst = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("sec", "min", "hour",
"mday", "mon", "year", "wday", "yday", "isdst"), class = c("POSIXlt",
"POSIXt"), tzone = "GMT"), arrivaltime = structure(list(sec = c(33,
42, 1, 4, 11, 29, 13, 33, 9, 42, 27, 1, 4, 55, 31, 25, 49,
20, 11, 48, 9, 13, 29, 13, 33, 9, 42, 27, 1, 4), min = c(24L,
35L, 20L, 27L, 32L, 0L, 11L, 24L, 57L, 35L, 54L, 20L, 27L,
47L, 52L, 49L, 50L, 17L, 32L, 5L, 8L, 14L, 0L, 11L, 24L,
57L, 35L, 54L, 20L, 27L), hour = c(18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 17L, 18L, 17L, 18L, 18L, 17L, 17L, 17L, 17L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 17L, 18L, 17L, 18L,
18L), mday = c(24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L,
24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L, 24L), mon = c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
year = c(114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L,
114L, 114L, 114L, 114L), wday = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), yday = c(82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L, 82L,
82L, 82L, 82L, 82L, 82L, 82L, 82L), isdst = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L
)), .Names = c("sec", "min", "hour", "mday", "mon", "year",
"wday", "yday", "isdst"), class = c("POSIXlt", "POSIXt"), tzone = "GMT"),
lcid = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("1000001", "1000002", "1000003",
"1000004", "1000005", "1000006", "1000007", "1000008", "1000009",
"1000010", "1000045", "1000054", "1000070", "1000088", "1000089",
"1000090", "1000097", "1000098", "1000099", "1000100", "1000101",
"1000102", "1000103", "1000104", "1000105", "1000106", "1000107",
"1000109"), class = "factor"), tripno = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
"2", "20", "21", "22", "23", "24", "3", "4", "5", "6", "7",
"8", "81", "9"), class = "factor"), destination = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Debden",
"Ealing Broadway", "Epping", "Grange Hill via Woodford",
"Hainault", "Hainault via Newbury Park", "Hainault via Woodford",
"Leytonstone", "Loughton", "Marble Arch", "Newbury Park",
"North Acton", "Northolt", "Ruislip Gardens", "West Ruislip",
"White City", "Woodford", "Woodford Via Hainault"), class = "factor"),
line = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "C", class = "factor")), .Names = c("depart",
"arrival", "traveltime", "departuretime", "arrivaltime", "lcid",
"tripno", "destination", "line"), row.names = c(NA, 30L), class = "data.frame")
答案 0 :(得分:1)
这是代码的矢量化版本。基本上我使用Map
来避免双循环并填充结果manullay(使用rbind,非常溶解)。
cutOffTopPercent <-
function(data,percent=0.99){
cut_off_dep_arr <-
function(dep,arr){
dataSubset <- data[data$depart == dep & data$arrival == arr,]
upperBorder <- getTopPercentileBottom(dataSubset, percent)
dataSubset[dataSubset$traveltime <= upperBorder,] ## <= not <
}
Map(cut_off_dep_arr,df$depart,df$arrival)
}
cutOffTopPercent(data=df)