我从去年的Superbowl下载了游戏,并且在有条件地替换和格式化NA
时遇到了麻烦。
我主要关注我的数据框中的以下两个向量:
> superbowl$Quarter
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[18] 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
[35] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[52] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[69] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3
[86] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[103] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[120] 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 4 4
[137] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[154] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
[171] 4 4 4 4 4 4 4 4 4 OT OT OT OT OT OT OT OT
[188] OT OT
Levels: 1 2 3 4 OT
> str(superbowl$Quarter)
Factor w/ 6 levels "","1","2","3",..: 2 2 2 2 2 2 2 2 2 2 ...
> superbowl$Time
[1] 15:00 15:00 14:55 14:26 13:47 13:37 12:55 12:16
[9] 11:32 10:41 10:28 9:56 9:22 8:38 7:56 7:15
[17] 6:36 5:55 5:48 5:14 5:08 4:25 3:45 3:04
[25] 2:27 1:48 1:36 0:58 0:24 15:00 14:19 14:08
[33] 13:29 13:11 12:46 12:20 12:20 12:15 12:15 12:10
[41] 12:04 11:17 10:45 10:37 10:11 9:45 9:00 8:55
[49] 8:48 8:48 8:48 8:48 8:12 8:07 8:02 7:34
[57] 6:53 6:10 6:01 5:20 5:16 5:16 5:12 4:36
[65] 3:59 3:23 2:36 2:21 2:21 2:00 1:43
[73] 1:43 1:37 0:59 0:33 0:30 0:23 0:23 0:18
[81] 0:12 0:05 0:05 0:02 15:00 14:54 14:11 13:24
[89] 13:19 13:07 13:02 12:57 12:57 12:52 12:45
[97] 12:07 11:36 11:06 10:26 9:57 9:28 8:43 8:36
[105] 8:31 8:31 8:31 7:51 7:13 6:46 6:12 6:04
[113] 5:35 4:54 4:49 4:07 3:17 2:43 2:12 2:06
[121] 2:06 2:05 1:30 0:59 0:59 0:52 0:04 15:00
[129] 14:51 14:29 14:00 13:21 13:13 12:47 12:24 12:20
[137] 11:48 11:07 10:25 9:48 9:44 9:40 9:00 8:31
[145] 8:24 7:41 7:03 6:34 6:00 5:56 5:56 5:53
[153] 5:18 4:47 4:40 3:56 3:50 3:50 3:44 3:38
[161] 3:30 3:24 3:17 2:38 2:34 2:28 2:03
[169] 1:57 1:25 1:00 0:57 0:57 0:52 0:32 0:19
[177] 0:18 0:11 0:03 15:00 15:00 14:26 13:59 13:12
[185] 12:37 11:56 11:18 11:13 11:08
155 Levels: 0:02 0:03 0:04 0:05 0:11 0:12 0:18 ... 9:57
> str(superbowl$Time)
Factor w/ 155 levels "","0:02","0:03",..: 79 79 78 74 68 67 56 48 40 32 ...
superbowl$Quarter
和superbowl$Time
之间的关系是,对于每次观察,superbowl$Time
是superbowl$Quarter
代表的季度剩余时间。我希望superbowl$Time
代表游戏剩余时间而不是季度剩余时间,以便在不同季度的游戏之间存在恒定的测量标准。为此,我编写了以下代码(其中superbowl$Time.1
只是我创建的一个变量,用于在不改变原始向量的情况下对其进行测试):
library(chron)
superbowl$Time <- ms(as.character(superbowl$Time))
superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time
转换效果非常好,除了superbowl$Quarter == "OT"
和缺失值的情况(当遇到挑战或额外点时superbowl$Time
中出现缺失值的实例)。在这两种情况下,R都会将值转换为NA
:
library(chron)
> superbowl$Time <- ms(as.character(superbowl$Time))
> superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time
Warning message:
NAs introduced by coercion
> superbowl$Time.1
[1] "60M 0S" "60M 0S" "59M 55S" "59M 26S" "58M 47S"
[6] "58M 37S" "57M 55S" "57M 16S" "56M 32S" "55M 41S"
[11] "55M 28S" "54M 56S" "54M 22S" "53M 38S" "52M 56S"
[16] "52M 15S" "51M 36S" "50M 55S" "50M 48S" "50M 14S"
[21] "50M 8S" "49M 25S" "48M 45S" "48M 4S" "47M 27S"
[26] "46M 48S" "46M 36S" "45M 58S" "45M 24S" "45M 0S"
[31] "44M 19S" "44M 8S" "43M 29S" "43M 11S" "42M 46S"
[36] "42M 20S" "42M 20S" "42M 15S" "42M 15S" "42M 10S"
[41] "42M 4S" "41M 17S" "40M 45S" "40M 37S" "40M 11S"
[46] "39M 45S" "39M 0S" "38M 55S" "38M 48S" "38M 48S"
[51] "38M 48S" "38M 48S" "38M 12S" "38M 7S" "38M 2S"
[56] "37M 34S" "36M 53S" "36M 10S" "36M 1S" "35M 20S"
[61] "35M 16S" "35M 16S" "35M 12S" "34M 36S" "33M 59S"
[66] "33M 23S" "32M 36S" NA "32M 21S" "32M 21S"
[71] "32M 0S" "31M 43S" "31M 43S" "31M 37S" "30M 59S"
[76] "30M 33S" "30M 30S" "30M 23S" "30M 23S" "30M 18S"
[81] "30M 12S" "30M 5S" "30M 5S" "30M 2S" "30M 0S"
[86] "29M 54S" "29M 11S" "28M 24S" "28M 19S" NA
[91] "28M 7S" "28M 2S" "27M 57S" "27M 57S" "27M 52S"
[96] "27M 45S" "27M 7S" "26M 36S" "26M 6S" "25M 26S"
[101] "24M 57S" "24M 28S" "23M 43S" "23M 36S" "23M 31S"
[106] "23M 31S" "23M 31S" "22M 51S" "22M 13S" "21M 46S"
[111] "21M 12S" "21M 4S" "20M 35S" "19M 54S" "19M 49S"
[116] "19M 7S" "18M 17S" "17M 43S" "17M 12S" "17M 6S"
[121] "17M 6S" "17M 5S" "16M 30S" "15M 59S" "15M 59S"
[126] "15M 52S" "15M 4S" "15M 0S" "14M 51S" "14M 29S"
[131] "14M 0S" "13M 21S" "13M 13S" "12M 47S" "12M 24S"
[136] "12M 20S" "11M 48S" "11M 7S" "10M 25S" "9M 48S"
[141] "9M 44S" "9M 40S" "9M 0S" "8M 31S" "8M 24S"
[146] "7M 41S" "7M 3S" "6M 34S" "6M 0S" "5M 56S"
[151] "5M 56S" "5M 53S" "5M 18S" "4M 47S" "4M 40S"
[156] "3M 56S" "3M 50S" "3M 50S" "3M 44S" "3M 38S"
[161] "3M 30S" "3M 24S" "3M 17S" "2M 38S" "2M 34S"
[166] "2M 28S" NA "2M 3S" "1M 57S" "1M 25S"
[171] "1M 0S" "57S" "57S" "52S" "32S"
[176] "19S" "18S" "11S" "3S" NA
[181] NA NA NA NA NA
[186] NA NA NA NA
我需要帮助有条件地替换和格式化这些NA
。我想要的是,在superbowl$Quarter == "OT"
的情况下,将值保持原样。在缺少值的情况下,我想用前一个观察中的superbowl$Time
值填充它们。
为了解决加班问题,我提出了这个问题:
library(chron)
superbowl$Time <- ms(as.character(superbowl$Time))
superbowl$Time.1 <- ifelse(superbowl$Quarter == "OT", superbowl$Time, (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time)
虽然superbowl$Time.1 <- superbowl$Time
和superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time
这两个参数都可以单独运作:
> superbowl$Time.1 <- superbowl$Time
> superbowl$Time.1
[1] "15M 0S" "15M 0S" "14M 55S" "14M 26S" "13M 47S"
[6] "13M 37S" "12M 55S" "12M 16S" "11M 32S" "10M 41S"
[11] "10M 28S" "9M 56S" "9M 22S" "8M 38S" "7M 56S"
[16] "7M 15S" "6M 36S" "5M 55S" "5M 48S" "5M 14S"
[21] "5M 8S" "4M 25S" "3M 45S" "3M 4S" "2M 27S"
[26] "1M 48S" "1M 36S" "58S" "24S" "15M 0S"
[31] "14M 19S" "14M 8S" "13M 29S" "13M 11S" "12M 46S"
[36] "12M 20S" "12M 20S" "12M 15S" "12M 15S" "12M 10S"
[41] "12M 4S" "11M 17S" "10M 45S" "10M 37S" "10M 11S"
[46] "9M 45S" "9M 0S" "8M 55S" "8M 48S" "8M 48S"
[51] "8M 48S" "8M 48S" "8M 12S" "8M 7S" "8M 2S"
[56] "7M 34S" "6M 53S" "6M 10S" "6M 1S" "5M 20S"
[61] "5M 16S" "5M 16S" "5M 12S" "4M 36S" "3M 59S"
[66] "3M 23S" "2M 36S" NA "2M 21S" "2M 21S"
[71] "2M 0S" "1M 43S" "1M 43S" "1M 37S" "59S"
[76] "33S" "30S" "23S" "23S" "18S"
[81] "12S" "5S" "5S" "2S" "15M 0S"
[86] "14M 54S" "14M 11S" "13M 24S" "13M 19S" NA
[91] "13M 7S" "13M 2S" "12M 57S" "12M 57S" "12M 52S"
[96] "12M 45S" "12M 7S" "11M 36S" "11M 6S" "10M 26S"
[101] "9M 57S" "9M 28S" "8M 43S" "8M 36S" "8M 31S"
[106] "8M 31S" "8M 31S" "7M 51S" "7M 13S" "6M 46S"
[111] "6M 12S" "6M 4S" "5M 35S" "4M 54S" "4M 49S"
[116] "4M 7S" "3M 17S" "2M 43S" "2M 12S" "2M 6S"
[121] "2M 6S" "2M 5S" "1M 30S" "59S" "59S"
[126] "52S" "4S" "15M 0S" "14M 51S" "14M 29S"
[131] "14M 0S" "13M 21S" "13M 13S" "12M 47S" "12M 24S"
[136] "12M 20S" "11M 48S" "11M 7S" "10M 25S" "9M 48S"
[141] "9M 44S" "9M 40S" "9M 0S" "8M 31S" "8M 24S"
[146] "7M 41S" "7M 3S" "6M 34S" "6M 0S" "5M 56S"
[151] "5M 56S" "5M 53S" "5M 18S" "4M 47S" "4M 40S"
[156] "3M 56S" "3M 50S" "3M 50S" "3M 44S" "3M 38S"
[161] "3M 30S" "3M 24S" "3M 17S" "2M 38S" "2M 34S"
[166] "2M 28S" NA "2M 3S" "1M 57S" "1M 25S"
[171] "1M 0S" "57S" "57S" "52S" "32S"
[176] "19S" "18S" "11S" "3S" "15M 0S"
[181] "15M 0S" "14M 26S" "13M 59S" "13M 12S" "12M 37S"
[186] "11M 56S" "11M 18S" "11M 13S" "11M 8S"
> superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time
Warning message:
NAs introduced by coercion
> superbowl$Time.1
[1] "60M 0S" "60M 0S" "59M 55S" "59M 26S" "58M 47S"
[6] "58M 37S" "57M 55S" "57M 16S" "56M 32S" "55M 41S"
[11] "55M 28S" "54M 56S" "54M 22S" "53M 38S" "52M 56S"
[16] "52M 15S" "51M 36S" "50M 55S" "50M 48S" "50M 14S"
[21] "50M 8S" "49M 25S" "48M 45S" "48M 4S" "47M 27S"
[26] "46M 48S" "46M 36S" "45M 58S" "45M 24S" "45M 0S"
[31] "44M 19S" "44M 8S" "43M 29S" "43M 11S" "42M 46S"
[36] "42M 20S" "42M 20S" "42M 15S" "42M 15S" "42M 10S"
[41] "42M 4S" "41M 17S" "40M 45S" "40M 37S" "40M 11S"
[46] "39M 45S" "39M 0S" "38M 55S" "38M 48S" "38M 48S"
[51] "38M 48S" "38M 48S" "38M 12S" "38M 7S" "38M 2S"
[56] "37M 34S" "36M 53S" "36M 10S" "36M 1S" "35M 20S"
[61] "35M 16S" "35M 16S" "35M 12S" "34M 36S" "33M 59S"
[66] "33M 23S" "32M 36S" NA "32M 21S" "32M 21S"
[71] "32M 0S" "31M 43S" "31M 43S" "31M 37S" "30M 59S"
[76] "30M 33S" "30M 30S" "30M 23S" "30M 23S" "30M 18S"
[81] "30M 12S" "30M 5S" "30M 5S" "30M 2S" "30M 0S"
[86] "29M 54S" "29M 11S" "28M 24S" "28M 19S" NA
[91] "28M 7S" "28M 2S" "27M 57S" "27M 57S" "27M 52S"
[96] "27M 45S" "27M 7S" "26M 36S" "26M 6S" "25M 26S"
[101] "24M 57S" "24M 28S" "23M 43S" "23M 36S" "23M 31S"
[106] "23M 31S" "23M 31S" "22M 51S" "22M 13S" "21M 46S"
[111] "21M 12S" "21M 4S" "20M 35S" "19M 54S" "19M 49S"
[116] "19M 7S" "18M 17S" "17M 43S" "17M 12S" "17M 6S"
[121] "17M 6S" "17M 5S" "16M 30S" "15M 59S" "15M 59S"
[126] "15M 52S" "15M 4S" "15M 0S" "14M 51S" "14M 29S"
[131] "14M 0S" "13M 21S" "13M 13S" "12M 47S" "12M 24S"
[136] "12M 20S" "11M 48S" "11M 7S" "10M 25S" "9M 48S"
[141] "9M 44S" "9M 40S" "9M 0S" "8M 31S" "8M 24S"
[146] "7M 41S" "7M 3S" "6M 34S" "6M 0S" "5M 56S"
[151] "5M 56S" "5M 53S" "5M 18S" "4M 47S" "4M 40S"
[156] "3M 56S" "3M 50S" "3M 50S" "3M 44S" "3M 38S"
[161] "3M 30S" "3M 24S" "3M 17S" "2M 38S" "2M 34S"
[166] "2M 28S" NA "2M 3S" "1M 57S" "1M 25S"
[171] "1M 0S" "57S" "57S" "52S" "32S"
[176] "19S" "18S" "11S" "3S" NA
[181] NA NA NA NA NA
[186] NA NA NA NA
在ifelse()
函数中,他们不会:
> superbowl$Time.1 <- ifelse(superbowl$Quarter == "OT", superbowl$Time, (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time)
Warning message:
In ifelse(superbowl$Quarter == "OT", superbowl$Time, (4 - as.numeric(as.character(superbowl$Quarter))) * :
NAs introduced by coercion
> superbowl$Time.1
[1] 0 0 55 26 47 37 55 16 32 41 28 56 22 38 56 15 36
[18] 55 48 14 8 25 45 4 27 48 36 58 24 0 19 8 29 11
[35] 46 20 20 15 15 10 4 17 45 37 11 45 0 55 48 48 48
[52] 48 12 7 2 34 53 10 1 20 16 16 12 36 59 23 36 NA
[69] 21 21 0 43 43 37 59 33 30 23 23 18 12 5 5 2 0
[86] 54 11 24 19 NA 7 2 57 57 52 45 7 36 6 26 57 28
[103] 43 36 31 31 31 51 13 46 12 4 35 54 49 7 17 43 12
[120] 6 6 5 30 59 59 52 4 0 51 29 0 21 13 47 24 20
[137] 48 7 25 48 44 40 0 31 24 41 3 34 0 56 56 53 18
[154] 47 40 56 50 50 44 38 30 24 17 38 34 28 NA 3 57 25
[171] 0 57 57 52 32 19 18 11 3 0 0 26 59 12 37 56 18
[188] 13 8
我不知道这些数字是什么或来自哪里。它们的格式也不正确。
ifelse()
函数对我的代码做了什么?如何让我superbowl$Time.1
在1-4区和(4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time)
加班superbowl$Time
?
为了解决缺失值的问题,我提出了以下功能:
f1 <- function(df, cols, match_with, to_x = 'OT'){
df[cols] <- lapply(df[cols], function(i)
ifelse(grepl(to_x, match_with, fixed = TRUE), superbowl$Time[-1],
i))
return(df)
}
superbowl = f1(superbowl, cols = c('Time.1'), match_with = superbowl$Quarter)
我怀疑这会起作用,但这里的问题是我不知道如何在函数中引用先前的观察(观察i-1,当i表示函数运行的每个给定观察时)。参考上述观察结果是我在上面写superbowl$Time[-1]
的地方尝试做的事情。这不起作用。它产生ifelse()
函数产生的相同数字串:
library(chron)
> superbowl$Time <- ms(as.character(superbowl$Time))
> superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time
Warning message:
NAs introduced by coercion
> f1 <- function(df, cols, match_with, to_x = 'OT'){
+ df[cols] <- lapply(df[cols], function(i)
+ ifelse(grepl(to_x, match_with, fixed = TRUE), superbowl$Time[-1],
+ i))
+ return(df)
+ }
> superbowl = f1(superbowl, cols = c('Time.1'), match_with = superbowl$Quarter)
>
> superbowl$Time.1
[1] 0 0 55 26 47 37 55 16 32 41 28 56 22 38 56 15 36
[18] 55 48 14 8 25 45 4 27 48 36 58 24 0 19 8 29 11
[35] 46 20 20 15 15 10 4 17 45 37 11 45 0 55 48 48 48
[52] 48 12 7 2 34 53 10 1 20 16 16 12 36 59 23 36 NA
[69] 21 21 0 43 43 37 59 33 30 23 23 18 12 5 5 2 0
[86] 54 11 24 19 NA 7 2 57 57 52 45 7 36 6 26 57 28
[103] 43 36 31 31 31 51 13 46 12 4 35 54 49 7 17 43 12
[120] 6 6 5 30 59 59 52 4 0 51 29 0 21 13 47 24 20
[137] 48 7 25 48 44 40 0 31 24 41 3 34 0 56 56 53 18
[154] 47 40 56 50 50 44 38 30 24 17 38 34 28 NA 3 57 25
[171] 0 57 57 52 32 19 18 11 3 0 26 59 12 37 56 18 13
[188] 8 0
在计算当前观察值时,是否有一种简单的方法可以让函数引用先前的观察结果?例如,superbowl$Time
的观察68应为2m 36s
,与superbowl$Time
的观察67相同,而不是或
NA
。
非常感谢你的帮助!