R

时间:2017-07-21 04:38:38

标签: r if-statement na missing-data chron

我下载了去年超级碗(Superbowl)的比赛数据,但在有条件地替换和格式化NA时遇到了麻烦。

我主要关注我的数据框中的以下两个向量:

> superbowl$Quarter
  [1] 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
 [18] 1  1  1  1  1  1  1  1  1  1  1  1  2  2  2  2  2 
 [35] 2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2 
 [52] 2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2 
 [69] 2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  3 
 [86] 3  3  3  3     3  3  3  3  3  3  3  3  3  3  3  3 
[103] 3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3  3 
[120] 3  3  3  3  3  3  3  3  4  4  4  4  4  4  4  4  4 
[137] 4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4 
[154] 4  4  4  4  4  4  4  4  4  4  4  4  4     4  4  4 
[171] 4  4  4  4  4  4  4  4  4  OT OT OT OT OT OT OT OT
[188] OT OT
Levels:  1 2 3 4 OT
> str(superbowl$Quarter)
 Factor w/ 6 levels "","1","2","3",..: 2 2 2 2 2 2 2 2 2 2 ...

> superbowl$Time
  [1] 15:00 15:00 14:55 14:26 13:47 13:37 12:55 12:16
  [9] 11:32 10:41 10:28 9:56  9:22  8:38  7:56  7:15 
 [17] 6:36  5:55  5:48  5:14  5:08  4:25  3:45  3:04 
 [25] 2:27  1:48  1:36  0:58  0:24  15:00 14:19 14:08
 [33] 13:29 13:11 12:46 12:20 12:20 12:15 12:15 12:10
 [41] 12:04 11:17 10:45 10:37 10:11 9:45  9:00  8:55 
 [49] 8:48  8:48  8:48  8:48  8:12  8:07  8:02  7:34 
 [57] 6:53  6:10  6:01  5:20  5:16  5:16  5:12  4:36 
 [65] 3:59  3:23  2:36        2:21  2:21  2:00  1:43 
 [73] 1:43  1:37  0:59  0:33  0:30  0:23  0:23  0:18 
 [81] 0:12  0:05  0:05  0:02  15:00 14:54 14:11 13:24
 [89] 13:19       13:07 13:02 12:57 12:57 12:52 12:45
 [97] 12:07 11:36 11:06 10:26 9:57  9:28  8:43  8:36 
[105] 8:31  8:31  8:31  7:51  7:13  6:46  6:12  6:04 
[113] 5:35  4:54  4:49  4:07  3:17  2:43  2:12  2:06 
[121] 2:06  2:05  1:30  0:59  0:59  0:52  0:04  15:00
[129] 14:51 14:29 14:00 13:21 13:13 12:47 12:24 12:20
[137] 11:48 11:07 10:25 9:48  9:44  9:40  9:00  8:31 
[145] 8:24  7:41  7:03  6:34  6:00  5:56  5:56  5:53 
[153] 5:18  4:47  4:40  3:56  3:50  3:50  3:44  3:38 
[161] 3:30  3:24  3:17  2:38  2:34  2:28        2:03 
[169] 1:57  1:25  1:00  0:57  0:57  0:52  0:32  0:19 
[177] 0:18  0:11  0:03  15:00 15:00 14:26 13:59 13:12
[185] 12:37 11:56 11:18 11:13 11:08
155 Levels:  0:02 0:03 0:04 0:05 0:11 0:12 0:18 ... 9:57
> str(superbowl$Time)
 Factor w/ 155 levels "","0:02","0:03",..: 79 79 78 74 68 67 56 48 40 32 ...

superbowl$Quarter 和 superbowl$Time 之间的关系是:对于每条观测,superbowl$Time 表示 superbowl$Quarter 所指那一节的剩余时间。我希望 superbowl$Time 表示整场比赛的剩余时间,而不是该节的剩余时间,以便不同节次的观测之间有统一的度量标准。为此,我编写了以下代码(其中 superbowl$Time.1 只是我创建的一个变量,用于在不改变原始向量的情况下进行测试):

# NOTE(review): ms() and the "60M 0S"-style output shown in this post look like
# lubridate rather than chron -- confirm which package is actually loaded.
library(chron)
# Convert the factor's "M:SS" labels to minute/second time spans.
superbowl$Time <- ms(as.character(superbowl$Time))
# Add 15 minutes for every quarter still to be played after the current one.
# as.numeric() gives NA for the non-numeric levels ("OT" and ""), which is the
# source of the "NAs introduced by coercion" warning.
superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time

转换效果非常好,除了 superbowl$Quarter == "OT" 以及存在缺失值的情况(比赛中出现挑战(challenge)或附加分(extra point)时,superbowl$Time 中会出现缺失值)。在这两种情况下,R 都会将值转换为 NA。

library(chron)
> superbowl$Time <- ms(as.character(superbowl$Time))
> superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time
Warning message:
NAs introduced by coercion 
> superbowl$Time.1
  [1] "60M 0S"  "60M 0S"  "59M 55S" "59M 26S" "58M 47S"
  [6] "58M 37S" "57M 55S" "57M 16S" "56M 32S" "55M 41S"
 [11] "55M 28S" "54M 56S" "54M 22S" "53M 38S" "52M 56S"
 [16] "52M 15S" "51M 36S" "50M 55S" "50M 48S" "50M 14S"
 [21] "50M 8S"  "49M 25S" "48M 45S" "48M 4S"  "47M 27S"
 [26] "46M 48S" "46M 36S" "45M 58S" "45M 24S" "45M 0S" 
 [31] "44M 19S" "44M 8S"  "43M 29S" "43M 11S" "42M 46S"
 [36] "42M 20S" "42M 20S" "42M 15S" "42M 15S" "42M 10S"
 [41] "42M 4S"  "41M 17S" "40M 45S" "40M 37S" "40M 11S"
 [46] "39M 45S" "39M 0S"  "38M 55S" "38M 48S" "38M 48S"
 [51] "38M 48S" "38M 48S" "38M 12S" "38M 7S"  "38M 2S" 
 [56] "37M 34S" "36M 53S" "36M 10S" "36M 1S"  "35M 20S"
 [61] "35M 16S" "35M 16S" "35M 12S" "34M 36S" "33M 59S"
 [66] "33M 23S" "32M 36S" NA        "32M 21S" "32M 21S"
 [71] "32M 0S"  "31M 43S" "31M 43S" "31M 37S" "30M 59S"
 [76] "30M 33S" "30M 30S" "30M 23S" "30M 23S" "30M 18S"
 [81] "30M 12S" "30M 5S"  "30M 5S"  "30M 2S"  "30M 0S" 
 [86] "29M 54S" "29M 11S" "28M 24S" "28M 19S" NA       
 [91] "28M 7S"  "28M 2S"  "27M 57S" "27M 57S" "27M 52S"
 [96] "27M 45S" "27M 7S"  "26M 36S" "26M 6S"  "25M 26S"
[101] "24M 57S" "24M 28S" "23M 43S" "23M 36S" "23M 31S"
[106] "23M 31S" "23M 31S" "22M 51S" "22M 13S" "21M 46S"
[111] "21M 12S" "21M 4S"  "20M 35S" "19M 54S" "19M 49S"
[116] "19M 7S"  "18M 17S" "17M 43S" "17M 12S" "17M 6S" 
[121] "17M 6S"  "17M 5S"  "16M 30S" "15M 59S" "15M 59S"
[126] "15M 52S" "15M 4S"  "15M 0S"  "14M 51S" "14M 29S"
[131] "14M 0S"  "13M 21S" "13M 13S" "12M 47S" "12M 24S"
[136] "12M 20S" "11M 48S" "11M 7S"  "10M 25S" "9M 48S" 
[141] "9M 44S"  "9M 40S"  "9M 0S"   "8M 31S"  "8M 24S" 
[146] "7M 41S"  "7M 3S"   "6M 34S"  "6M 0S"   "5M 56S" 
[151] "5M 56S"  "5M 53S"  "5M 18S"  "4M 47S"  "4M 40S" 
[156] "3M 56S"  "3M 50S"  "3M 50S"  "3M 44S"  "3M 38S" 
[161] "3M 30S"  "3M 24S"  "3M 17S"  "2M 38S"  "2M 34S" 
[166] "2M 28S"  NA        "2M 3S"   "1M 57S"  "1M 25S" 
[171] "1M 0S"   "57S"     "57S"     "52S"     "32S"    
[176] "19S"     "18S"     "11S"     "3S"      NA       
[181] NA        NA        NA        NA        NA       
[186] NA        NA        NA        NA

我需要帮助有条件地替换和格式化这些NA。我想要的是,在superbowl$Quarter == "OT"的情况下,将值保持原样。在缺少值的情况下,我想用前一个观察中的superbowl$Time值填充它们。

为了解决加时赛(OT)的问题,我想出了下面的写法:

# NOTE(review): ms() looks like lubridate rather than chron -- confirm which
# package is actually loaded.
library(chron)
# Convert the factor's "M:SS" labels to minute/second time spans.
superbowl$Time <- ms(as.character(superbowl$Time))
# Intent: keep the quarter clock as-is for overtime rows, otherwise add
# 15 minutes per remaining quarter.
# NOTE(review): ifelse() returns a plain vector and does not keep the class of
# its yes/no arguments; the printed result appears to be only the seconds part
# of each time (0, 0, 55, 26, 47, ... for 15:00, 15:00, 14:55, 14:26, 13:47).
superbowl$Time.1 <- ifelse(superbowl$Quarter == "OT", superbowl$Time, (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time)

虽然 superbowl$Time.1 <- superbowl$Time 和 superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time 这两个表达式都可以单独正常运行:

> superbowl$Time.1 <- superbowl$Time
> superbowl$Time.1
  [1] "15M 0S"  "15M 0S"  "14M 55S" "14M 26S" "13M 47S"
  [6] "13M 37S" "12M 55S" "12M 16S" "11M 32S" "10M 41S"
 [11] "10M 28S" "9M 56S"  "9M 22S"  "8M 38S"  "7M 56S" 
 [16] "7M 15S"  "6M 36S"  "5M 55S"  "5M 48S"  "5M 14S" 
 [21] "5M 8S"   "4M 25S"  "3M 45S"  "3M 4S"   "2M 27S" 
 [26] "1M 48S"  "1M 36S"  "58S"     "24S"     "15M 0S" 
 [31] "14M 19S" "14M 8S"  "13M 29S" "13M 11S" "12M 46S"
 [36] "12M 20S" "12M 20S" "12M 15S" "12M 15S" "12M 10S"
 [41] "12M 4S"  "11M 17S" "10M 45S" "10M 37S" "10M 11S"
 [46] "9M 45S"  "9M 0S"   "8M 55S"  "8M 48S"  "8M 48S" 
 [51] "8M 48S"  "8M 48S"  "8M 12S"  "8M 7S"   "8M 2S"  
 [56] "7M 34S"  "6M 53S"  "6M 10S"  "6M 1S"   "5M 20S" 
 [61] "5M 16S"  "5M 16S"  "5M 12S"  "4M 36S"  "3M 59S" 
 [66] "3M 23S"  "2M 36S"  NA        "2M 21S"  "2M 21S" 
 [71] "2M 0S"   "1M 43S"  "1M 43S"  "1M 37S"  "59S"    
 [76] "33S"     "30S"     "23S"     "23S"     "18S"    
 [81] "12S"     "5S"      "5S"      "2S"      "15M 0S" 
 [86] "14M 54S" "14M 11S" "13M 24S" "13M 19S" NA       
 [91] "13M 7S"  "13M 2S"  "12M 57S" "12M 57S" "12M 52S"
 [96] "12M 45S" "12M 7S"  "11M 36S" "11M 6S"  "10M 26S"
[101] "9M 57S"  "9M 28S"  "8M 43S"  "8M 36S"  "8M 31S" 
[106] "8M 31S"  "8M 31S"  "7M 51S"  "7M 13S"  "6M 46S" 
[111] "6M 12S"  "6M 4S"   "5M 35S"  "4M 54S"  "4M 49S" 
[116] "4M 7S"   "3M 17S"  "2M 43S"  "2M 12S"  "2M 6S"  
[121] "2M 6S"   "2M 5S"   "1M 30S"  "59S"     "59S"    
[126] "52S"     "4S"      "15M 0S"  "14M 51S" "14M 29S"
[131] "14M 0S"  "13M 21S" "13M 13S" "12M 47S" "12M 24S"
[136] "12M 20S" "11M 48S" "11M 7S"  "10M 25S" "9M 48S" 
[141] "9M 44S"  "9M 40S"  "9M 0S"   "8M 31S"  "8M 24S" 
[146] "7M 41S"  "7M 3S"   "6M 34S"  "6M 0S"   "5M 56S" 
[151] "5M 56S"  "5M 53S"  "5M 18S"  "4M 47S"  "4M 40S" 
[156] "3M 56S"  "3M 50S"  "3M 50S"  "3M 44S"  "3M 38S" 
[161] "3M 30S"  "3M 24S"  "3M 17S"  "2M 38S"  "2M 34S" 
[166] "2M 28S"  NA        "2M 3S"   "1M 57S"  "1M 25S" 
[171] "1M 0S"   "57S"     "57S"     "52S"     "32S"    
[176] "19S"     "18S"     "11S"     "3S"      "15M 0S" 
[181] "15M 0S"  "14M 26S" "13M 59S" "13M 12S" "12M 37S"
[186] "11M 56S" "11M 18S" "11M 13S" "11M 8S" 

> superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time
Warning message:
NAs introduced by coercion 
> superbowl$Time.1
  [1] "60M 0S"  "60M 0S"  "59M 55S" "59M 26S" "58M 47S"
  [6] "58M 37S" "57M 55S" "57M 16S" "56M 32S" "55M 41S"
 [11] "55M 28S" "54M 56S" "54M 22S" "53M 38S" "52M 56S"
 [16] "52M 15S" "51M 36S" "50M 55S" "50M 48S" "50M 14S"
 [21] "50M 8S"  "49M 25S" "48M 45S" "48M 4S"  "47M 27S"
 [26] "46M 48S" "46M 36S" "45M 58S" "45M 24S" "45M 0S" 
 [31] "44M 19S" "44M 8S"  "43M 29S" "43M 11S" "42M 46S"
 [36] "42M 20S" "42M 20S" "42M 15S" "42M 15S" "42M 10S"
 [41] "42M 4S"  "41M 17S" "40M 45S" "40M 37S" "40M 11S"
 [46] "39M 45S" "39M 0S"  "38M 55S" "38M 48S" "38M 48S"
 [51] "38M 48S" "38M 48S" "38M 12S" "38M 7S"  "38M 2S" 
 [56] "37M 34S" "36M 53S" "36M 10S" "36M 1S"  "35M 20S"
 [61] "35M 16S" "35M 16S" "35M 12S" "34M 36S" "33M 59S"
 [66] "33M 23S" "32M 36S" NA        "32M 21S" "32M 21S"
 [71] "32M 0S"  "31M 43S" "31M 43S" "31M 37S" "30M 59S"
 [76] "30M 33S" "30M 30S" "30M 23S" "30M 23S" "30M 18S"
 [81] "30M 12S" "30M 5S"  "30M 5S"  "30M 2S"  "30M 0S" 
 [86] "29M 54S" "29M 11S" "28M 24S" "28M 19S" NA       
 [91] "28M 7S"  "28M 2S"  "27M 57S" "27M 57S" "27M 52S"
 [96] "27M 45S" "27M 7S"  "26M 36S" "26M 6S"  "25M 26S"
[101] "24M 57S" "24M 28S" "23M 43S" "23M 36S" "23M 31S"
[106] "23M 31S" "23M 31S" "22M 51S" "22M 13S" "21M 46S"
[111] "21M 12S" "21M 4S"  "20M 35S" "19M 54S" "19M 49S"
[116] "19M 7S"  "18M 17S" "17M 43S" "17M 12S" "17M 6S" 
[121] "17M 6S"  "17M 5S"  "16M 30S" "15M 59S" "15M 59S"
[126] "15M 52S" "15M 4S"  "15M 0S"  "14M 51S" "14M 29S"
[131] "14M 0S"  "13M 21S" "13M 13S" "12M 47S" "12M 24S"
[136] "12M 20S" "11M 48S" "11M 7S"  "10M 25S" "9M 48S" 
[141] "9M 44S"  "9M 40S"  "9M 0S"   "8M 31S"  "8M 24S" 
[146] "7M 41S"  "7M 3S"   "6M 34S"  "6M 0S"   "5M 56S" 
[151] "5M 56S"  "5M 53S"  "5M 18S"  "4M 47S"  "4M 40S" 
[156] "3M 56S"  "3M 50S"  "3M 50S"  "3M 44S"  "3M 38S" 
[161] "3M 30S"  "3M 24S"  "3M 17S"  "2M 38S"  "2M 34S" 
[166] "2M 28S"  NA        "2M 3S"   "1M 57S"  "1M 25S" 
[171] "1M 0S"   "57S"     "57S"     "52S"     "32S"    
[176] "19S"     "18S"     "11S"     "3S"      NA       
[181] NA        NA        NA        NA        NA       
[186] NA        NA        NA        NA     

但把它们放进 ifelse() 函数之后,就不能正常工作了:

> superbowl$Time.1 <- ifelse(superbowl$Quarter == "OT", superbowl$Time, (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time)
Warning message:
In ifelse(superbowl$Quarter == "OT", superbowl$Time, (4 - as.numeric(as.character(superbowl$Quarter))) *  :
  NAs introduced by coercion
> superbowl$Time.1
  [1]  0  0 55 26 47 37 55 16 32 41 28 56 22 38 56 15 36
 [18] 55 48 14  8 25 45  4 27 48 36 58 24  0 19  8 29 11
 [35] 46 20 20 15 15 10  4 17 45 37 11 45  0 55 48 48 48
 [52] 48 12  7  2 34 53 10  1 20 16 16 12 36 59 23 36 NA
 [69] 21 21  0 43 43 37 59 33 30 23 23 18 12  5  5  2  0
 [86] 54 11 24 19 NA  7  2 57 57 52 45  7 36  6 26 57 28
[103] 43 36 31 31 31 51 13 46 12  4 35 54 49  7 17 43 12
[120]  6  6  5 30 59 59 52  4  0 51 29  0 21 13 47 24 20
[137] 48  7 25 48 44 40  0 31 24 41  3 34  0 56 56 53 18
[154] 47 40 56 50 50 44 38 30 24 17 38 34 28 NA  3 57 25
[171]  0 57 57 52 32 19 18 11  3  0  0 26 59 12 37 56 18
[188] 13  8

我不知道这些数字代表什么,也不知道它们是从哪里来的。它们的格式也不正确。

ifelse() 函数对我的代码做了什么?怎样才能让 superbowl$Time.1 在第 1-4 节取 (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time,而在加时赛(OT)取 superbowl$Time?

为了解决缺失值的问题,我写了以下函数:

# Overwrite selected columns of a data frame wherever a companion vector
# contains a marker string.
#
# df         - data frame to modify.
# cols       - names of the columns in `df` to rewrite.
# match_with - vector searched for `to_x`, one entry per row of `df`.
# to_x       - literal (non-regex) marker to look for; defaults to 'OT'.
#
# Returns `df` with each column in `cols` replaced by the ifelse() result.
#
# NOTE(review): the replacement values come from the global `superbowl$Time`,
# not from `df`. A negative index drops an element, so `superbowl$Time[-1]` is
# the vector without its first entry (one shorter, recycled by ifelse()); it
# lines each row up with the NEXT observation, not the previous one.
f1 <- function(df, cols, match_with, to_x = 'OT'){
  df[cols] <- lapply(df[cols], function(i) 
    ifelse(grepl(to_x, match_with, fixed = TRUE), superbowl$Time[-1], 
           i))
  return(df)
}
# Apply to Time.1, using Quarter to find the overtime rows.
superbowl = f1(superbowl, cols = c('Time.1'), match_with = superbowl$Quarter)

我本以为这样可行,但问题在于,我不知道如何在函数中引用前一条观测(即第 i-1 条观测,其中 i 表示函数当前处理的那条观测)。我在上面写 superbowl$Time[-1],就是想引用前一条观测,但这并不起作用。它产生的是一串与前面 ifelse() 调用几乎相同的数字:

library(chron)
> superbowl$Time <- ms(as.character(superbowl$Time))
> superbowl$Time.1 <- (4 - as.numeric(as.character(superbowl$Quarter)))*ms("15:00") + superbowl$Time
Warning message:
NAs introduced by coercion 
> f1 <- function(df, cols, match_with, to_x = 'OT'){
+   df[cols] <- lapply(df[cols], function(i) 
+     ifelse(grepl(to_x, match_with, fixed = TRUE), superbowl$Time[-1], 
+            i))
+   return(df)
+ }
> superbowl = f1(superbowl, cols = c('Time.1'), match_with = superbowl$Quarter)
> 
> superbowl$Time.1
  [1]  0  0 55 26 47 37 55 16 32 41 28 56 22 38 56 15 36
 [18] 55 48 14  8 25 45  4 27 48 36 58 24  0 19  8 29 11
 [35] 46 20 20 15 15 10  4 17 45 37 11 45  0 55 48 48 48
 [52] 48 12  7  2 34 53 10  1 20 16 16 12 36 59 23 36 NA
 [69] 21 21  0 43 43 37 59 33 30 23 23 18 12  5  5  2  0
 [86] 54 11 24 19 NA  7  2 57 57 52 45  7 36  6 26 57 28
[103] 43 36 31 31 31 51 13 46 12  4 35 54 49  7 17 43 12
[120]  6  6  5 30 59 59 52  4  0 51 29  0 21 13 47 24 20
[137] 48  7 25 48 44 40  0 31 24 41  3 34  0 56 56 53 18
[154] 47 40 56 50 50 44 38 30 24 17 38 34 28 NA  3 57 25
[171]  0 57 57 52 32 19 18 11  3  0 26 59 12 37 56 18 13
[188]  8  0

在计算当前观测值时,是否有一种简单的方法可以让函数引用前一条观测?例如,superbowl$Time 的第 68 条观测应为 2M 36S,与 superbowl$Time 的第 67 条观测相同,而不是 NA。

非常感谢你的帮助!

0 个答案:

没有答案