我想在训练和测试中划分数据。
我计算了百分位数,我想为数字变量选择唯一值
column <- c("NAME", "COUNTRY_CODE", "REGION_COUNTRY", "NACE_REV_2",
"CATEGORY_COMPANY", "TOTAL_ASSETS", "NET_CURRENT_ASSETS",
"FIXED_ASSETS", "NET_INCOME", "CAPITAL", "WORKING_CAPITAL",
"LOANS", "CURRENT_RATIO", "ROA", "EBIT")
data <- PL_ALL[, column]
nums <- sapply(data, is.numeric)
data.n <- data[, nums]
fact <- sapply(data, is.factor)
data.f <- data[, fact]
percentile <- apply(X=data.n, MARGIN=2, FUN=function(x)
round(quantile(x, seq(0.1, 1, 0.1), na.rm=TRUE), 2))
#There is ok
numeric <- colnames(data.n[which(unique >= 10)])
num_as_fact <- colnames(data.n[which(unique < 10 & unique > 1)])
行:
numeric<-colnames(data.n[which(unique>=10)])
我有一个问题:
Error in unique >= 10 :comparison (5) is possible only for atomic and list types
和相似之处:
num_as_fact<-colnames(data.n[which(unique<10 & unique>1 )])
我有问题:
Error in unique < 10 : comparison (3) is possible only for atomic and list types
我的数据:
dput(head(baza.n,15))
structure(list(TOTAL_ASSETS = c(8L, 11L, 11L, 15L, 16L, 17L,
17L, 18L, 21L, 22L, 25L, 28L, 28L, 29L, 32L), NET_CURRENT_ASSETS = c(-222L,
-275L, -1281L, -353L, -97L, -48L, -16L, -8L, -70L, -642L, -375L,
-236L, -50L, -476L, -1845L), FIXED_ASSETS = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 21L, 0L, 0L, 1L, 0L), NET_INCOME = c(-1L,
-1L, -2L, -1L, -9L, 0L, 0L, 0L, 0L, -8L, -1L, -1L, 5L, -23L,
-11L), CAPITAL = c(-210L, -274L, -1281L, 26L, 21L, -72L, -22L,
-11L, 51L, 129L, -393L, 0L, 3L, -507L, 192L), WORKING_CAPITAL = c(-168L,
-248L, -987L, -275L, -22L, -23L, -7L, -2L, -1L, -414L, -342L,
-123L, -55L, -478L, -660L), LOANS = c(30L, 0L, 106L, 30L, 8L,
25L, 0L, 0L, 0L, 89L, 1L, 83L, 5L, 0L, 671L), CURRENT_RATIO = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), ROA = c(-13L,
-9L, -14L, -3L, -56L, 1L, -1L, -1L, -1L, -37L, -5L, -4L, 16L,
-79L, -31L), EBIT = c(-1L, -1L, 2L, -1L, -9L, 0L, 0L, 0L, 0L,
-12L, -1L, -1L, -4L, -23L, -11L), WC.TA = c(-21, -22.55, -89.73,
-18.33, -1.38, -1.35, -0.41, -0.11, -0.05, -18.82, -13.68, -4.39,
-1.96, -16.48, -20.62), EBIT.TA = c(-0.12, -0.09, 0.18, -0.07,
-0.56, 0, 0, 0, 0, -0.55, -0.04, -0.04, -0.14, -0.79, -0.34)), row.names = c(NA,
15L), class = "data.frame")