na.strings 不起作用

时间:2014-09-30 07:46:25

标签: r sas

# Sample of the data in question (looks like dput() output): a data.frame with
# 15 rows and 15 columns; the last column's name is NA.
# Every factor label below starts with a space (" Private", " Male", ...), so
# the placeholder for unknown values is stored as " ?" (space + question mark),
# not "?". In this sample only row 15 of `native-country` uses it (code 1L).
structure(list(age = c(39L, 50L, 38L, 53L, 28L, 37L, 49L, 52L, 
31L, 42L, 37L, 30L, 23L, 32L, 40L), workclass = structure(c(8L, 
7L, 5L, 5L, 5L, 5L, 5L, 7L, 5L, 5L, 5L, 8L, 5L, 5L, 5L), .Label = c(" ?", 
" Federal-gov", " Local-gov", " Never-worked", " Private", " Self-emp-inc", 
" Self-emp-not-inc", " State-gov", " Without-pay"), class = "factor"), 
fnlwgt = c(77516L, 83311L, 215646L, 234721L, 338409L, 284582L, 
160187L, 209642L, 45781L, 159449L, 280464L, 141297L, 122272L, 
205019L, 121772L), education = structure(c(10L, 10L, 12L, 
2L, 10L, 13L, 7L, 12L, 13L, 10L, 16L, 10L, 10L, 8L, 9L), .Label = c(" 10th", 
" 11th", " 12th", " 1st-4th", " 5th-6th", " 7th-8th", " 9th", 
" Assoc-acdm", " Assoc-voc", " Bachelors", " Doctorate", 
" HS-grad", " Masters", " Preschool", " Prof-school", " Some-college"
), class = "factor"), `education-num` = c(13L, 13L, 9L, 7L, 
13L, 14L, 5L, 9L, 14L, 13L, 10L, 13L, 13L, 12L, 11L), `marital-status` = structure(c(5L, 
3L, 1L, 3L, 3L, 3L, 4L, 3L, 5L, 3L, 3L, 3L, 5L, 5L, 3L), .Label = c(" Divorced", 
" Married-AF-spouse", " Married-civ-spouse", " Married-spouse-absent", 
" Never-married", " Separated", " Widowed"), class = "factor"), 
occupation = structure(c(2L, 5L, 7L, 7L, 11L, 5L, 9L, 5L, 
11L, 5L, 5L, 11L, 2L, 13L, 4L), .Label = c(" ?", " Adm-clerical", 
" Armed-Forces", " Craft-repair", " Exec-managerial", " Farming-fishing", 
" Handlers-cleaners", " Machine-op-inspct", " Other-service", 
" Priv-house-serv", " Prof-specialty", " Protective-serv", 
" Sales", " Tech-support", " Transport-moving"), class = "factor"), 
relationship = structure(c(2L, 1L, 2L, 1L, 6L, 6L, 2L, 1L, 
2L, 1L, 1L, 1L, 4L, 2L, 1L), .Label = c(" Husband", " Not-in-family", 
" Other-relative", " Own-child", " Unmarried", " Wife"), class = "factor"), 
race = structure(c(5L, 5L, 5L, 3L, 3L, 5L, 3L, 5L, 5L, 5L, 
3L, 2L, 5L, 3L, 2L), .Label = c(" Amer-Indian-Eskimo", " Asian-Pac-Islander", 
" Black", " Other", " White"), class = "factor"), sex = structure(c(2L, 
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L), .Label = c(" Female", 
" Male"), class = "factor"), `capital-gain` = c(2174L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 14084L, 5178L, 0L, 0L, 0L, 0L, 0L
), `capital-loss` = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L), `hours-per-week` = c(40L, 13L, 40L, 
40L, 40L, 40L, 16L, 45L, 50L, 40L, 80L, 40L, 30L, 50L, 40L
), `native-country` = structure(c(40L, 40L, 40L, 40L, 6L, 
40L, 24L, 40L, 40L, 40L, 40L, 20L, 40L, 40L, 1L), .Label = c(" ?", 
" Cambodia", " Canada", " China", " Columbia", " Cuba", " Dominican-Republic", 
" Ecuador", " El-Salvador", " England", " France", " Germany", 
" Greece", " Guatemala", " Haiti", " Holand-Netherlands", 
" Honduras", " Hong", " Hungary", " India", " Iran", " Ireland", 
" Italy", " Jamaica", " Japan", " Laos", " Mexico", " Nicaragua", 
" Outlying-US(Guam-USVI-etc)", " Peru", " Philippines", " Poland", 
" Portugal", " Puerto-Rico", " Scotland", " South", " Taiwan", 
" Thailand", " Trinadad&Tobago", " United-States", " Vietnam", 
" Yugoslavia"), class = "factor"), `NA` = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L), .Label = c(" <=50K", 
" >50K"), class = "factor")), .Names = c("age", "workclass", 
"fnlwgt", "education", "education-num", "marital-status", "occupation", 
"relationship", "race", "sex", "capital-gain", "capital-loss", 
"hours-per-week", "native-country", NA), row.names = c(NA, 15L), class = "data.frame")

这些数据中有一些问号(?)。SAS 和 R 都不会把它们识别为缺失值。所以我想先把它们转换为缺失值,或者直接删除含有问号的观测。

最好的办法是把它们变成缺失值,但我不知道该怎么做。所以我尝试了下面的 na.strings 参数。

# Asker's attempt: read the comma-separated file without a header row and
# treat the string "?" as NA.
# NOTE(review): the sample data stores the marker with a leading space (" ?"),
# and character fields are not whitespace-stripped by default, so "?" most
# likely never matches. na.strings = " ?" or strip.white = TRUE should be
# needed here -- confirm against the raw file.
adult<- read.table("adult.txt", sep= ",", header=F,na.strings="?" )

但这不起作用。请帮帮我。

1)如何让 R 把 "?" 识别为缺失值?

2)或者,我怎样才能删除带问号的观测?

1 个答案:

答案 0 :(得分:0)

除了修正 na.strings(数据中的值带有前导空格,所以应写成 na.strings = " ?")之外,您还可以在读入数据之后把这些条目设为 NA:

# Read the comma-separated file as-is; header = FALSE means the first line is
# data, not column names. Spell out FALSE: `F` is an ordinary variable that
# can be reassigned.
adult <- read.table("adult.txt", sep = ",", header = FALSE)
# `adult == " ?"` yields a logical matrix with one entry per cell; cells that
# are exactly " ?" (note the leading space kept from the file) become NA.
adult[adult == " ?"] <- NA

这样做的好处是处理起来更灵活,例如,如果想把任何包含问号的条目都视为缺失值:

# Set every cell whose text contains a literal "?" to NA. fixed = TRUE makes
# "?" a plain character instead of a regex quantifier.
# grepl() has to run column by column: called on the whole data frame it
# returns one logical per column (not per cell), which would select entire
# columns rather than the individual matching entries.
adult[] <- lapply(adult, function(column) {
  column[grepl("?", column, fixed = TRUE)] <- NA
  column
})