将时间序列按 80%/20% 拆分为训练集和测试集

时间:2018-12-09 13:27:48

标签: r dplyr

我正在尝试将以下时间序列拆分为训练和测试集。

最初(当数据还不是 xts 对象时),以下代码可以正常工作:

# Row number marking the end of the first 80% of the data. floor() keeps it
# a whole number: slice() expects integer row positions, and
# nrow(GOOG) * 0.8 is fractional whenever nrow(GOOG) is not a multiple of 5.
train_row <- floor(nrow(GOOG) * 0.8)

# Date found at that row; it is the cut-off between the two sets.
train_date <- GOOG %>%
  slice(train_row) %>%
  pull(date)

# Training set: every observation dated strictly before the cut-off.
train <- GOOG %>%
  filter(date < ymd(train_date))

# Test set: the cut-off date itself and everything after it.
test <- GOOG %>%
  filter(date >= ymd(train_date))

也就是说,把数据的前 80% 作为训练集,其余 20% 作为测试集:先取出位于 80% 位置那一行的日期,所有早于该日期的数据都归入训练集,其余数据归入测试集。

是否可以在保留 xts 对象并使用 index(GOOG) 的前提下实现同样的拆分?

数据:

GOOG <- structure(c(921.77002, 929.539978, 938.679993, 946.289978, 952, 
957, 953, 967.840027, 975, 962.25, 972.219971, 953.809998, 954.679993, 
951.780029, 929.400024, 941.890015, 932.380005, 928.609985, 930.340027, 
926.75, 929.059998, 927.090027, 920.609985, 917.549988, 907.969971, 
922.530029, 924.22998, 925.289978, 925.780029, 910.309998, 910, 
912.719971, 921.929993, 928.659973, 923.48999, 916, 905.099976, 
920.049988, 931.76001, 941.130005, 933.080017, 930.150024, 931.72998, 
936.48999, 934.25, 932.590027, 930.659973, 931.25, 924.659973, 
920.01001, 917.419983, 922.97998, 933, 927.75, 925.450012, 923.719971, 
927.73999, 941.359985, 952, 959.97998, 954, 957, 955.48999, 966.700012, 
980, 980, 973.719971, 987.450012, 992, 992.099976, 990.289978, 
991.77002, 986, 989.440002, 989.52002, 970, 968.369995, 980, 
1009.190002, 1014, 1015.219971, 1017.210022, 1021.76001, 1022.109985, 
1028.98999, 1027.27002, 1030.52002, 1033.98999, 1026.459961, 
1023.419983, 1022.590027, 1019.210022, 1022.52002, 1034.01001, 
1020.26001, 1023.309998, 1035, 1035.869995, 1040, 1055.089966, 
930.380005, 931.429993, 946.299988, 954.450012, 956.909973, 960.73999, 
968.039978, 973.039978, 975.900024, 973.22998, 986.200012, 959.700012, 
955, 951.780029, 943.830017, 943.590027, 937.447021, 932.599976, 
932.23999, 930.307007, 931.700012, 935.814026, 925.97998, 919.26001, 
917.780029, 924.66803, 926.549988, 932.700012, 926.859985, 915.275024, 
913, 925.859985, 929.929993, 930.840027, 925.554993, 919.244995, 
923.330017, 930.81897, 941.97998, 942.47998, 937, 930.914978, 
936.409973, 936.98999, 938.380005, 933.47998, 937.25, 932.77002, 
926.48999, 922.080017, 922.419983, 933.880005, 936.530029, 934.72998, 
926.400024, 930.820007, 949.900024, 950.690002, 959.786011, 962.539978, 
958, 960.390015, 970.909973, 979.460022, 985.424988, 981.570007, 
990.710022, 994.119995, 997.210022, 993.906982, 996.440002, 996.719971, 
988.880005, 991, 989.52002, 972.22998, 976.090027, 987.599976, 
1048.390015, 1024.969971, 1024, 1029.670044, 1028.089966, 1032.650024, 
1034.869995, 1033.969971, 1043.521973, 1033.98999, 1030.76001, 
1031.579956, 1026.810059, 1024.089966, 1035.920044, 1034.420044, 
1022.609985, 1035.109985, 1039.706055, 1043.177979, 1055.459961, 
1062.375, 919.590027, 922, 934.469971, 943.01001, 948.005005, 
949.241028, 950.599976, 964.030029, 961.51001, 960.150024, 970.77002, 
945.400024, 942.278992, 920, 927.5, 926.039978, 929.26001, 916.679993, 
922.23999, 923.030029, 926.5, 925.609985, 917.25, 906.130005, 
905.580017, 918.190002, 919.820007, 923.445007, 910.97998, 907.153992, 
903.400024, 911.474976, 919.359985, 915.5, 915.5, 911.869995, 
905, 919.650024, 931.76001, 935.150024, 921.960022, 919.27002, 
923.619995, 924.880005, 926.919983, 923.861023, 929.859985, 924, 
916.359985, 910.599976, 912.549988, 922, 923.830017, 926.47998, 
909.700012, 921.140015, 927.73999, 940.549988, 951.51001, 947.840027, 
949.140015, 950.690002, 955.179993, 963.359985, 976.109985, 966.080017, 
972.25, 985, 989, 984, 988.590027, 986.974976, 978.390015, 984.580017, 
966.119995, 961, 960.52002, 972.200012, 1008.200012, 1007.5, 
1010.419983, 1016.950012, 1013.01001, 1020.309998, 1025, 1025.130005, 
1028.449951, 1019.666016, 1025.280029, 1022.570007, 1014.150024, 
1015.419983, 1022.52002, 1017.75, 1017.5, 1022.655029, 1031.430054, 
1035, 1038.439941, 1040, 928.799988, 930.090027, 943.830017, 
947.159973, 955.98999, 953.419983, 965.400024, 970.890015, 968.150024, 
972.919983, 980.340027, 950.700012, 947.799988, 934.090027, 941.530029, 
930.5, 930.830017, 930.390015, 923.650024, 927.960022, 929.359985, 
926.789978, 922.900024, 907.23999, 914.390015, 922.669983, 922.219971, 
926.960022, 910.97998, 910.669983, 906.659973, 924.690002, 927, 
921.280029, 915.890015, 913.809998, 921.289978, 929.570007, 939.330017, 
937.340027, 928.450012, 927.809998, 935.950012, 926.5, 929.080017, 
932.070007, 935.090027, 925.109985, 920.289978, 915, 921.809998, 
931.580017, 932.450012, 928.530029, 920.969971, 924.859985, 944.48999, 
949.5, 959.109985, 953.27002, 957.789978, 951.679993, 969.960022, 
978.890015, 977, 972.599976, 989.25, 987.830017, 989.679993, 
992, 992.179993, 992.809998, 984.450012, 988.200012, 968.450012, 
970.539978, 973.330017, 972.559998, 1019.27002, 1017.109985, 
1016.640015, 1025.5, 1025.579956, 1032.47998, 1025.900024, 1033.329956, 
1039.849976, 1031.26001, 1028.069946, 1025.75, 1026, 1020.909973, 
1032.5, 1019.090027, 1018.380005, 1034.48999, 1035.959961, 1040.609985, 
1054.209961, 1047.410034, 1192800, 1113200, 1532100, 1294700, 
1053800, 1165500, 1154000, 1224500, 1624500, 1711000, 3248300, 
4661000, 2088300, 3213000, 1846400, 1970100, 1277700, 1824400, 
1202500, 1082300, 1032200, 1061600, 1192100, 1824000, 1206800, 
1064500, 883400, 1006700, 1277200, 1342700, 943400, 1166700, 
1090200, 1270300, 1053400, 1086500, 1185600, 1301200, 1582600, 
947400, 1326400, 1527700, 1212700, 1011500, 1267000, 1134400, 
1102600, 1397600, 2505400, 1306900, 936700, 1669800, 1290600, 
1052700, 1856800, 1666900, 2239400, 1020300, 1581000, 1283400, 
888300, 952400, 1213800, 1173900, 891400, 968400, 1693300, 1262400, 
1169800, 910500, 1290200, 1057600, 1313600, 1183200, 1478400, 
1212200, 1211300, 2042100, 5167700, 2085100, 1330700, 1373400, 
1049000, 1076400, 1125200, 1112300, 1088700, 1245200, 720000, 
885800, 959200, 854000, 1129700, 1397100, 953500, 1097000, 746300, 
537000, 1307900, 1424400, 928.799988, 930.090027, 943.830017, 
947.159973, 955.98999, 953.419983, 965.400024, 970.890015, 968.150024, 
972.919983, 980.340027, 950.700012, 947.799988, 934.090027, 941.530029, 
930.5, 930.830017, 930.390015, 923.650024, 927.960022, 929.359985, 
926.789978, 922.900024, 907.23999, 914.390015, 922.669983, 922.219971, 
926.960022, 910.97998, 910.669983, 906.659973, 924.690002, 927, 
921.280029, 915.890015, 913.809998, 921.289978, 929.570007, 939.330017, 
937.340027, 928.450012, 927.809998, 935.950012, 926.5, 929.080017, 
932.070007, 935.090027, 925.109985, 920.289978, 915, 921.809998, 
931.580017, 932.450012, 928.530029, 920.969971, 924.859985, 944.48999, 
949.5, 959.109985, 953.27002, 957.789978, 951.679993, 969.960022, 
978.890015, 977, 972.599976, 989.25, 987.830017, 989.679993, 
992, 992.179993, 992.809998, 984.450012, 988.200012, 968.450012, 
970.539978, 973.330017, 972.559998, 1019.27002, 1017.109985, 
1016.640015, 1025.5, 1025.579956, 1032.47998, 1025.900024, 1033.329956, 
1039.849976, 1031.26001, 1028.069946, 1025.75, 1026, 1020.909973, 
1032.5, 1019.090027, 1018.380005, 1034.48999, 1035.959961, 1040.609985, 
1054.209961, 1047.410034, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 
1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 
1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 
1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 
0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 
0, 0, 0, 1, 0.00762659649095543, 0.00138893089649783, 0.0147727527455792, 
0.00352813106176098, 0.00932262474313839, -0.00268831998962671, 
0.0125653344943579, 0.0056867525000186, -0.00282214355660038, 
0.00492688001007568, 0.00762657169104508, -0.0302344229386443, 
-0.00305040913368582, -0.0144650360556874, 0.00796497316633915, 
-0.0117150050027772, 0.000354666308436391, -0.000472698550717299, 
-0.00724426411648449, 0.00466626740433007, 0.00150864581103693, 
-0.00276535146926948, -0.00419723356136681, -0.0169682886474819, 
0.00788107345224054, 0.0090551819947422, -0.000487728015749256, 
0.00513982688410031, -0.0172391922205249, -0.000340289585727183, 
-0.00440336244178141, 0.0198862082113775, 0.00249813234165353, 
-0.00617041100323623, -0.00585057076060858, -0.00227103360221692, 
0.00818548715419065, 0.00898742979704914, 0.0104994889319832, 
-0.00211852060935469, -0.00948430104756415, -0.000689335981181594, 
0.00877336309971533, -0.010096705891169, 0.00278469185105235, 
0.00321822657391202, 0.00324012142577179, -0.0106728140733341, 
-0.00521019887165097, -0.00574816430305625, 0.00744262076502733, 
0.0105987340354277, 0.000933891865565917, -0.00420396047997473, 
-0.00814196392564925, 0.004223822841668, 0.0212248397793964, 
0.00530446066453272, 0.0101211005792523, -0.00608894192671761, 
0.00474152958256258, -0.00637925342751922, 0.0192081678026828, 
0.00920655779357471, -0.00193077360177174, -0.0045036069600819, 
0.0171190874057765, -0.00143541369724542, 0.00187276754923715, 
0.0023441991516544, 0.000181444556451638, 0.000634970473547991, 
-0.00842052962484363, 0.00380923353577045, -0.0199858325846691, 
0.00215805253147128, 0.00287472856682269, -0.000791118106450051, 
0.0480279078885169, -0.00211919801192617, -0.000462064090345216, 
0.00871496780500025, 7.7967820575342e-05, 0.00672792400010591, 
-0.00637296231157924, 0.00724235483593283, 0.00630971739679231, 
-0.00826077434077854, -0.0030933653676728, -0.00225660326812049, 
0.000243724104313836, -0.0049610399610136, 0.0113526435303026, 
-0.0129878673123487, -0.00069672156648426, 0.0158192275191027, 
0.00142096203366848, 0.00448861362895858, 0.0130692345797547, 
-0.00645025872602245), class = c("xts", "zoo"), .indexCLASS = "Date", .indexTZ = "UTC", tclass = "Date", tzone = "UTC", src = "yahoo", updated = structure(1544361775.45962, class = c("POSIXct", 
"POSIXt")), index = structure(c(1499644800, 1499731200, 1499817600, 
1499904000, 1499990400, 1500249600, 1500336000, 1500422400, 1500508800, 
1500595200, 1500854400, 1500940800, 1501027200, 1501113600, 1501200000, 
1501459200, 1501545600, 1501632000, 1501718400, 1501804800, 1502064000, 
1502150400, 1502236800, 1502323200, 1502409600, 1502668800, 1502755200, 
1502841600, 1502928000, 1503014400, 1503273600, 1503360000, 1503446400, 
1503532800, 1503619200, 1503878400, 1503964800, 1504051200, 1504137600, 
1504224000, 1504569600, 1504656000, 1504742400, 1504828800, 1505088000, 
1505174400, 1505260800, 1505347200, 1505433600, 1505692800, 1505779200, 
1505865600, 1505952000, 1506038400, 1506297600, 1506384000, 1506470400, 
1506556800, 1506643200, 1506902400, 1506988800, 1507075200, 1507161600, 
1507248000, 1507507200, 1507593600, 1507680000, 1507766400, 1507852800, 
1508112000, 1508198400, 1508284800, 1508371200, 1508457600, 1508716800, 
1508803200, 1508889600, 1508976000, 1509062400, 1509321600, 1509408000, 
1509494400, 1509580800, 1509667200, 1509926400, 1510012800, 1510099200, 
1510185600, 1510272000, 1510531200, 1510617600, 1510704000, 1510790400, 
1510876800, 1511136000, 1511222400, 1511308800, 1511481600, 1511740800, 
1511827200), tzone = "UTC", tclass = "Date"), .Dim = c(100L, 
8L), .Dimnames = list(NULL, c("open", "high", "low", "close", 
"volume", "adjusted", "direction", "returns")))

编辑:

这似乎可行:

# Number of rows in the training set. floor() makes it a whole number:
# with a fractional value, `1:train_date` and `test_date:nrow(GOOG)` can be
# cut off differently when used as row indices, so the two sets may end up
# sharing the boundary row.
train_date <- floor(nrow(GOOG) * 0.8)
# Row position of the first test observation (kept for reference).
test_date <- train_date + 1

# head()/tail() split by row count, so every row lands in exactly one set,
# including the edge cases where one of the sets is empty.
train <- head(GOOG, train_date)
test <- tail(GOOG, nrow(GOOG) - train_date)

# Last training date and first test date; the former must be the earlier one.
max(index(train))
min(index(test))

输出:

> max(index(train))
[1] "2018-08-24"
> min(index(test))
[1] "2018-08-27"

但是,当我使用其他训练/测试拆分比例运行时,训练数据的结束日期与测试数据的开始日期是同一天。

# Row position of the 70% mark. nrow(GOOG) * 0.7 is a double and is not
# rounded here, so it need not be exactly a whole number.
train_date <- nrow(GOOG) *0.7
# Row position intended to be the first row of the test set.
test_date <- train_date + 1
# Rows 1..train_date go to the training set, the remaining rows to the test set.
# NOTE(review): if train_date is not exactly a whole number, the two index
# sequences may be truncated differently and pick the same boundary row,
# which would explain the identical dates reported below — confirm.
train <- GOOG[1:train_date,]
test <- GOOG[test_date:nrow(GOOG),]

# Last date in the training set and first date in the test set.
max(index(train))
min(index(test))

输出:

> max(index(train))
[1] "2018-07-09"
> min(index(test))
[1] "2018-07-09"

1 个答案:

答案 0 :(得分:1)

可以考虑直接按行索引(row index)进行拆分。不过,您可能需要先按日期对数据进行排序。

library(xts)