我有550 242个观测数据和9个变量的数据集
str(train)
'data.frame': 550242 obs. of 9 variables:
$ State.Name : chr "ANDHRA PRADESH" "ANDHRA PRADESH" "ANDHRA PRADESH" "ANDHRA PRADESH" ...
$ District.Name : chr "EAST GODAVARI(04)" "EAST GODAVARI(04)" "EAST GODAVARI(04)" "EAST GODAVARI(04)" ...
$ Block.Name : chr "PRATHIPADU(10)" "PRATHIPADU(10)" "PRATHIPADU(10)" "PRATHIPADU(10)" ...
$ Panchayat.Name : chr "GOKAVARAM(04)" "GOKAVARAM(04)" "GAJJANAPUDI(06)" "GAJJANAPUDI(06)" ...
$ Village.Name : chr "VANTHADA(014 )" "PANDAVULAPALEM(022 )" "G. KOTHURU(023 )" "GAJJANAPUDI(029 )" ...
$ Habitation.Name : chr "VANTHADA(0404410014010400)" "PANDAVULAPALEM(0404410022010400)" "G. KOTHURU(0404410023010600)" "GAJJANAPUDI(0404410029010600)" ...
$ Quality.Parameter: chr "Salinity" "Fluoride" "Salinity" "Salinity" ...
$ Year : chr "1/4/2009" "1/4/2009" "1/4/2009" "1/4/2009" ...
$ newdate : Date, format: "2009-04-01" "2009-04-01" "2009-04-01" ...
head(unique(train$District.Name))
[1] "EAST GODAVARI(04)" "WEST GODAVARI(05)" "KRISHNA(06)" "GUNTUR(07)" "ADILABAD(19)"
[6] "KARIMNAGAR(20)"
在列名train$District.Name
中,我只想保留字符串并清理其余部分,所以这是我的代码。:
state_1$District.Name <- gsub("("," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub(")"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("1"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("0"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("29"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("16"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("3"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("5"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("14"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("24"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("22"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("25"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("21"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("20"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("9"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub(")"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("1"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("0"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("29"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("16"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("3"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("5"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("8"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("14"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("24"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("22"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("25"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("2"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("6"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("4"," ",fixed=TRUE,state_1$District.Name)
state_1$District.Name <- gsub("7"," ",fixed=TRUE,state_1$District.Name)
由于存在所有这些字符,但我可以使用循环执行相同的操作(更少的代码来处理):
vector<-c(" `(",")","1","0","29","8","16","3","5","8","14","21","22","23","24","25","2","6","4","7","9","14")`
for (i in 1:length(state_1$District.Name)) {
for(j in 1:length(vector))
{
train$District.Name <- gsub(vector[j],new.vector[j],fixed=TRUE,train$District.Name)
}
}
此代码完成了工作,但需要花费太多时间。 作为顶级代码,它可以在几秒钟内完成所有500k变量(但更多行代码)的工作。
我可以用更少的代码和更快的执行来获得大量观察的两全其美吗?
答案 0 :(得分:3)
如果我清楚地理解,对于state_1 $ District.Name你只想保留字符串。您可以使用reg exp:state_1$District.Name <- gsub(pattern = "\\(.*","",state_1$District.Name)