我被分配了构建预测模型的任务。给我的数据集完全由分类（categorical）变量组成，共有92个变量。其中一部分如下：
# Reproducible sample of the data: a data.frame with 50 rows and 25 columns,
# every column stored as a factor (categorical variable).
# NOTE(review): the `.Label` / `.Names` form looks like dput() output — confirm.
# Diabetes.Therapy carries the literal level "N/A" (a string level, not a
# missing value NA), so it will be counted as its own category in any table.
Dataset <- structure(list(Age.Group = structure(c(1L, 2L, 3L, 3L, 4L, 4L,
4L, 1L, 4L, 4L, 2L, 1L, 2L, 5L, 3L, 2L, 1L, 4L, 1L, 4L, 4L, 3L,
4L, 2L, 2L, 1L, 4L, 2L, 3L, 2L, 4L, 4L, 3L, 3L, 3L, 3L, 5L, 3L,
2L, 2L, 2L, 2L, 4L, 2L, 3L, 4L, 3L, 3L, 1L, 4L), .Label = c("1",
"2", "3", "4", "5"), class = "factor"), Sex = structure(c(2L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
1L), .Label = c("Female", "Male"), class = "factor"), LOS = structure(c(2L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L), .Label = c("Abnormal", "Normal"), class = "factor"), Day.to.Operation = structure(c(1L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L,
1L), .Label = c("Abnormal", "Normal"), class = "factor"), Admit.Source = structure(c(2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Emergency", "Outpatient clinic"), class = "factor"),
Insurance.Payors = structure(c(3L, 1L, 3L, 3L, 1L, 1L, 1L,
3L, 1L, 3L, 1L, 3L, 1L, 1L, 5L, 1L, 1L, 2L, 1L, 5L, 1L, 5L,
1L, 3L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 5L, 1L, 1L, 1L, 5L, 5L,
1L, 1L, 1L, 1L, 1L, 3L, 5L, 1L, 1L, 1L, 1L, 3L, 4L), .Label = c("Basic medical insurance for urban residents",
"Basic medical insurance for urban residents Others", "Free Medical Care",
"New Rural Cooperative Medical Care", "Self payment"), class = "factor"),
Current.Recent.Smoker...1.year. = structure(c(1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 2L), .Label = c("No", "Yes"), class = "factor"), Hypertension = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
Dyslipidemia = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), .Label = c("No",
"Yes"), class = "factor"), Family.History.of.Premature.CAD = structure(c(2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L), .Label = c("No", "Yes"), class = "factor"),
MI.History = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("No",
"Yes"), class = "factor"), Heart.Failure.History = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
PCI.History = structure(c(2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), BMI.Group = structure(c(3L, 2L,
3L, 2L, 3L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 3L,
3L, 3L, 3L, 4L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 4L, 2L, 3L, 3L, 3L, 2L, 3L, 2L, 3L,
3L, 4L, 2L), .Label = c("2", "3", "4", "5"), class = "factor"),
Cerebrovascular.Disease = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("No", "Yes"), class = "factor"), Peripheral.Arterial.Disease = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Chronic.Lung.Disease = structure(c(1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), Diabetes.Mellitus = structure(c(2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 2L, 1L), .Label = c("No", "Yes"), class = "factor"),
Diabetes.Therapy = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 3L, 4L, 2L, 4L, 4L, 1L, 2L, 4L, 4L, 4L, 2L, 2L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 4L,
2L, 4L, 4L, 4L, 4L, 2L, 4L, 2L, 4L, 4L, 4L, 4L, 2L), .Label = c("Diet",
"Insulin", "N/A", "Oral"), class = "factor"), Heart.Rate = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 2L, 2L, 2L), .Label = c("Abnormal", "Normal"), class = "factor"),
CAD.Presentation = structure(c(3L, 5L, 5L, 4L, 5L, 5L, 4L,
1L, 5L, 5L, 5L, 5L, 4L, 4L, 5L, 1L, 5L, 5L, 5L, 3L, 5L, 5L,
5L, 1L, 5L, 5L, 5L, 5L, 5L, 3L, 4L, 1L, 5L, 5L, 5L, 5L, 3L,
5L, 4L, 3L, 5L, 4L, 5L, 5L, 2L, 5L, 5L, 3L, 1L, 1L), .Label = c("Non STEMI 7 days",
"Silent myocardial ischemia 14 days", "Stable angina 42 days",
"STEMI 7 days", "Unstable angina 60 days"), class = "factor"),
STEMI.Non.STEMI.Onset.Date = structure(c(1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L), .Label = c("0", "1", "17"), class = "factor"), STEMI.Non.STEMI.Estimated.Time = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor"),
Anginal.Classification.w.in.2.Weeks = structure(c(2L, 4L,
3L, 5L, 1L, 5L, 4L, 1L, 5L, 4L, 5L, 2L, 2L, 3L, 1L, 1L, 2L,
5L, 5L, 3L, 2L, 5L, 2L, 2L, 2L, 4L, 1L, 2L, 3L, 5L, 2L, 4L,
3L, 5L, 4L, 4L, 5L, 2L, 1L, 3L, 2L, 1L, 3L, 1L, 5L, 2L, 3L,
2L, 1L, 2L), .Label = c("CCS I", "CCS II", "CCS III", "CCS IV",
"No symptoms"), class = "factor"), Anti.Anginal.Drug.Therapy.within.2.Weeks = structure(c(2L,
1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
1L, 2L, 2L, 2L), .Label = c("No", "Yes"), class = "factor")), .Names = c("Age.Group",
"Sex", "LOS", "Day.to.Operation", "Admit.Source", "Insurance.Payors",
"Current.Recent.Smoker...1.year.", "Hypertension", "Dyslipidemia",
"Family.History.of.Premature.CAD", "MI.History", "Heart.Failure.History",
"PCI.History", "BMI.Group", "Cerebrovascular.Disease", "Peripheral.Arterial.Disease",
"Chronic.Lung.Disease", "Diabetes.Mellitus", "Diabetes.Therapy",
"Heart.Rate", "CAD.Presentation", "STEMI.Non.STEMI.Onset.Date",
"STEMI.Non.STEMI.Estimated.Time", "Anginal.Classification.w.in.2.Weeks",
"Anti.Anginal.Drug.Therapy.within.2.Weeks"), class = "data.frame", row.names = c(NA,
-50L))
到目前为止，我已经完成了字符串清理和缺失数据处理。下一步我需要帮助：删除异常值，并基于这个分类数据集计算卡方矩阵。我是数据分析的新手，在这一步上很困惑。如果能得到这方面的帮助，我将非常感激。