# Sample of the data under discussion, written as a data.frame literal
# (looks like dput() output): 15 rows, 15 columns.
# The last column has no real name -- its entry in .Names is NA.
# All factor levels shown here carry a leading space, so the unknown-value
# marker is stored as " ?" (space + question mark), not "?"; see the
# workclass, occupation and native-country levels.
structure(list(age = c(39L, 50L, 38L, 53L, 28L, 37L, 49L, 52L,
31L, 42L, 37L, 30L, 23L, 32L, 40L), workclass = structure(c(8L,
7L, 5L, 5L, 5L, 5L, 5L, 7L, 5L, 5L, 5L, 8L, 5L, 5L, 5L), .Label = c(" ?",
" Federal-gov", " Local-gov", " Never-worked", " Private", " Self-emp-inc",
" Self-emp-not-inc", " State-gov", " Without-pay"), class = "factor"),
fnlwgt = c(77516L, 83311L, 215646L, 234721L, 338409L, 284582L,
160187L, 209642L, 45781L, 159449L, 280464L, 141297L, 122272L,
205019L, 121772L), education = structure(c(10L, 10L, 12L,
2L, 10L, 13L, 7L, 12L, 13L, 10L, 16L, 10L, 10L, 8L, 9L), .Label = c(" 10th",
" 11th", " 12th", " 1st-4th", " 5th-6th", " 7th-8th", " 9th",
" Assoc-acdm", " Assoc-voc", " Bachelors", " Doctorate",
" HS-grad", " Masters", " Preschool", " Prof-school", " Some-college"
), class = "factor"), `education-num` = c(13L, 13L, 9L, 7L,
13L, 14L, 5L, 9L, 14L, 13L, 10L, 13L, 13L, 12L, 11L), `marital-status` = structure(c(5L,
3L, 1L, 3L, 3L, 3L, 4L, 3L, 5L, 3L, 3L, 3L, 5L, 5L, 3L), .Label = c(" Divorced",
" Married-AF-spouse", " Married-civ-spouse", " Married-spouse-absent",
" Never-married", " Separated", " Widowed"), class = "factor"),
occupation = structure(c(2L, 5L, 7L, 7L, 11L, 5L, 9L, 5L,
11L, 5L, 5L, 11L, 2L, 13L, 4L), .Label = c(" ?", " Adm-clerical",
" Armed-Forces", " Craft-repair", " Exec-managerial", " Farming-fishing",
" Handlers-cleaners", " Machine-op-inspct", " Other-service",
" Priv-house-serv", " Prof-specialty", " Protective-serv",
" Sales", " Tech-support", " Transport-moving"), class = "factor"),
relationship = structure(c(2L, 1L, 2L, 1L, 6L, 6L, 2L, 1L,
2L, 1L, 1L, 1L, 4L, 2L, 1L), .Label = c(" Husband", " Not-in-family",
" Other-relative", " Own-child", " Unmarried", " Wife"), class = "factor"),
race = structure(c(5L, 5L, 5L, 3L, 3L, 5L, 3L, 5L, 5L, 5L,
3L, 2L, 5L, 3L, 2L), .Label = c(" Amer-Indian-Eskimo", " Asian-Pac-Islander",
" Black", " Other", " White"), class = "factor"), sex = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L), .Label = c(" Female",
" Male"), class = "factor"), `capital-gain` = c(2174L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 14084L, 5178L, 0L, 0L, 0L, 0L, 0L
), `capital-loss` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), `hours-per-week` = c(40L, 13L, 40L,
40L, 40L, 40L, 16L, 45L, 50L, 40L, 80L, 40L, 30L, 50L, 40L
), `native-country` = structure(c(40L, 40L, 40L, 40L, 6L,
40L, 24L, 40L, 40L, 40L, 40L, 20L, 40L, 40L, 1L), .Label = c(" ?",
" Cambodia", " Canada", " China", " Columbia", " Cuba", " Dominican-Republic",
" Ecuador", " El-Salvador", " England", " France", " Germany",
" Greece", " Guatemala", " Haiti", " Holand-Netherlands",
" Honduras", " Hong", " Hungary", " India", " Iran", " Ireland",
" Italy", " Jamaica", " Japan", " Laos", " Mexico", " Nicaragua",
" Outlying-US(Guam-USVI-etc)", " Peru", " Philippines", " Poland",
" Portugal", " Puerto-Rico", " Scotland", " South", " Taiwan",
" Thailand", " Trinadad&Tobago", " United-States", " Vietnam",
" Yugoslavia"), class = "factor"), `NA` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L), .Label = c(" <=50K",
" >50K"), class = "factor")), .Names = c("age", "workclass",
"fnlwgt", "education", "education-num", "marital-status", "occupation",
"relationship", "race", "sex", "capital-gain", "capital-loss",
"hours-per-week", "native-country", NA), row.names = c(NA, 15L), class = "data.frame")
这些数据中有一些问号(?)。SAS 和 R 都不会把它识别为缺失值。所以我想先把问号转换为缺失值,或者删除含有问号的观测。
最好的办法是把它变成缺失值,但我不知道该怎么做。于是我尝试了下面的 na.strings 参数。
# Read the comma-separated file (no header row), asking read.table() to
# treat the string "?" as NA.
# NOTE(review): the factor levels in the sample data are " ?" with a leading
# space, which is presumably why this na.strings value does not match.
adult <- read.table(
  file = "adult.txt",
  sep = ",",
  header = FALSE,
  na.strings = "?"
)
但这不起作用。请帮帮我。
1)如何让 R 把“?”识别为缺失值?
2)或者,如何剔除带问号的观测?
答案 0 :(得分:0)
除了修正 na.strings 参数之外,您还可以在读入数据后将这些条目设为 NA。
# Read the raw file first (comma-separated, no header row); the " ?" entries
# are kept as ordinary values at this point.
adult <- read.table(file = "adult.txt", sep = ",", header = FALSE)
# Flag every cell equal to " ?" (leading space included) and mark it missing.
is.na(adult) <- adult == " ?"
这样做的好处是处理起来更灵活,例如,如果想把任何包含问号的条目都视为缺失值:
# Set every cell whose text contains a literal "?" to NA.
# grepl() has to be applied column by column: called on the whole data frame
# it treats it as a list of columns and returns one value per column rather
# than one per cell, so the assignment would hit whole columns (or nothing).
# Assigning through adult[] keeps the object a data frame with its column
# names and types.
adult[] <- lapply(adult, function(column) {
  column[grepl("?", column, fixed = TRUE)] <- NA
  column
})