我正在处理的数据集是不平衡的,所以我试图平衡该 dataset。
为此,我尝试了多种工具,例如 caret、mlr、ROSE,但都出错了。
str(mydata)
Classes ‘data.table’ and 'data.frame': 40596053 obs. of 8 variables:
$ SessionID : chr "1" "1" "1" "1" ...
$ Timestamp.x: chr "2014-04-07T10:51:09.277Z" "2014-04-07T10:54:09.868Z" "2014-04-07T10:54:46.998Z" "2014-04-07T10:57:00.306Z" ...
$ ItemID.x : chr "214536502" "214536500" "214536506" "214577561" ...
$ Category : chr "0" "0" "0" "0" ...
$ Timestamp.y: chr NA NA NA NA ...
$ ItemID.y : chr "0" "0" "0" "0" ...
$ Price : chr NA NA NA NA ...
$ Quantity : chr NA NA NA NA ...
- attr(*, ".internal.selfref")=<externalptr>
- attr(*, "sorted")= chr "SessionID"
dput(head(mydata,20))
structure(list(SessionID = c("1", "1", "1", "1", "10000001",
"10000001", "10000001", "10000001", "10000001", "10000002", "10000002",
"10000002", "10000002", "10000003", "10000003", "10000003", "10000004",
"10000004", "10000004", "10000004"), Timestamp.x = c("2014-04-07T10:51:09.277Z",
"2014-04-07T10:54:09.868Z", "2014-04-07T10:54:46.998Z", "2014-04-07T10:57:00.306Z",
"2014-09-08T10:35:38.841Z", "2014-09-08T10:40:20.143Z", "2014-09-08T10:40:36.704Z",
"2014-09-08T10:41:12.386Z", "2014-09-08T10:48:34.245Z", "2014-09-08T19:10:51.206Z",
"2014-09-08T19:13:31.104Z", "2014-09-08T19:14:54.518Z", "2014-09-08T19:33:38.355Z",
"2014-09-05T11:32:15.524Z", "2014-09-05T11:34:25.159Z", "2014-09-05T11:37:23.321Z",
"2014-09-05T13:14:45.867Z", "2014-09-05T13:14:45.867Z", "2014-09-05T13:55:18.886Z",
"2014-09-05T13:55:18.886Z"), ItemID.x = c("214536502", "214536500",
"214536506", "214577561", "214854230", "214556216", "214556212",
"214854230", "214854125", "214849322", "214838094", "214714721",
"214853711", "214853090", "214851326", "214853094", "214853090",
"214853090", "214851326", "214851326"), Category = c("0", "0",
"0", "0", "S", "S", "S", "S", "S", "S", "S", "S", "S", "3", "3",
"3", "3", "3", "3", "3"), Timestamp.y = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "2014-09-05T14:07:33.845Z",
"2014-09-05T14:07:33.845Z", "2014-09-05T14:07:33.845Z", "2014-09-05T14:07:33.845Z"
), ItemID.y = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("0",
"1"), class = "factor"), Price = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, "4188", "1046", "4188", "1046"
), Quantity = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, "1", "1", "1", "1")), .Names = c("SessionID",
"Timestamp.x", "ItemID.x", "Category", "Timestamp.y", "ItemID.y",
"Price", "Quantity"), sorted = "SessionID", class = c("data.table",
"data.frame"), row.names = c(NA, -20L), .internal.selfref = <pointer: 0x0000000013950788>)
使用mlr
task = makeClassifTask(data = mydata, target = "ItemID.y")
Warning in makeTask(type = type, data = data, weights = weights, blocking = blocking, :
Provided data is not a pure data.frame but from class data.table, hence it will be converted.
Error in (function (cn, x) :
Unsupported feature type (character) in column 'SessionID'.
所以我将 SessionID 的类型从 char 更改为 num,但是 Timestamp.x 列又出现了相同的错误:
Warning in makeTask(type = type, data = data, weights = weights, blocking = blocking, :
Provided data is not a pure data.frame but from class data.table, hence it will be converted.
Error in (function (cn, x) :
Unsupported feature type (chr) in column 'Timestamp.x'.
然后,我将 Timestamp.x 的类型从 char 更改为 Date,但再次遇到相同的错误:
Warning in makeTask(type = type, data = data, weights = weights, blocking = blocking, :
Provided data is not a pure data.frame but from class data.table, hence it will be converted.
Error in (function (cn, x) :
Unsupported feature type (Date) in column 'Timestamp.x'.
我也尝试了 caret 和 ROSE:
library(caret)
> x <- matrix(mydata %>% select(-ItemID.y)
> y <- as.factor(mydata$ItemID.y)
> imbl_crt <- downSample(x, y, yname = "ItemID.y")
Error in `$<-.data.frame`(`*tmp*`, .outcome, value = c(1L, 1L, 1L, 1L, :
replacement has 40596053 rows, data has 7
> library(ROSE)
> data_balanced_over <- ovun.sample(mydata$ItemID.y ~ ., data = mydata, method = "over", N = 40596053) #balance dataset using ROSE
Error in (function (formula, data, method, subset, na.action, N, p = 0.5, :
The response variable has only one class.
我该如何解决其中任意一个问题并平衡 dataset?
谢谢