是否可以按ID将独热编码(one-hot)的多行合并为一行?

时间:2019-07-14 19:30:18

标签: sql r sql-server one-hot-encoding

假设我有以下数据:

# Sample input: twelve one-hot encoded rows, four consecutive rows per
# quote_id. The indicator vectors are grouped in fours to mirror that layout.
quote_id <- rep(c(123, 789, 456), each = 4)
Cake <- rep("chocolate", times = 12)
Egg <- c(1, 0, 0, 0,  0, 0, 0, 0,  0, 0, 1, 0)
Regular_Milk <- c(0, 0, 1, 0,  0, 0, 0, 0,  0, 0, 0, 0)
Almond_Milk <- c(0, 0, 0, 0,  1, 0, 0, 0,  0, 0, 0, 0)
Coconut_Milk <- c(0, 0, 0, 0,  0, 0, 0, 0,  1, 0, 0, 0)
Regular_Sugar <- c(0, 0, 0, 0,  0, 0, 0, 0,  0, 1, 0, 0)
Cane_Sugar <- c(0, 1, 0, 0,  0, 0, 1, 0,  0, 0, 0, 0)
Regular_Flour <- c(0, 0, 0, 1,  0, 0, 0, 0,  0, 0, 0, 0)
Oat_Flour <- c(0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0)
Wheat_Flour <- c(0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 1)
Almond_Flour <- c(0, 0, 0, 0,  0, 0, 0, 1,  0, 0, 0, 0)

# Column names are taken from the variable names passed to data.frame().
Old_Cake_Data <- data.frame(
  quote_id, Cake, Egg, Regular_Milk, Almond_Milk, Coconut_Milk,
  Regular_Sugar, Cane_Sugar, Regular_Flour, Oat_Flour, Wheat_Flour,
  Almond_Flour
)

在SQL或R中是否可以获取以下输出:

# Desired result: one row per quote_id, in order of first appearance, with
# each indicator column collapsed across that quote_id's rows.
quote_id <- c(123, 789, 456)
Cake <- rep("chocolate", times = 3)
Egg <- c(1, 0, 1)
Regular_Milk <- c(1, 0, 0)
Almond_Milk <- c(0, 1, 0)
Coconut_Milk <- c(0, 0, 1)
Regular_Sugar <- c(0, 0, 1)
Cane_Sugar <- c(1, 1, 0)
Regular_Flour <- c(1, 0, 0)
Oat_Flour <- c(0, 0, 0)
Wheat_Flour <- c(0, 0, 1)
Almond_Flour <- c(0, 1, 0)

# Column names are taken from the variable names passed to data.frame().
New_Cake_Data <- data.frame(
  quote_id, Cake, Egg, Regular_Milk, Almond_Milk, Coconut_Milk,
  Regular_Sugar, Cane_Sugar, Regular_Flour, Oat_Flour, Wheat_Flour,
  Almond_Flour
)

我考虑过对每一列求和,但是问题是我需要按quote_id对输出进行排序。

本质上,我希望每个quote_id在输出中只对应一条记录。

1 个答案:

答案 0 :(得分:0)

在R中,您可以尝试以下操作:

# Sample input: twelve one-hot encoded rows, four consecutive rows per
# quote_id.
quote_id <- rep(c(123, 789, 456), each = 4)
Cake <- rep("chocolate", times = 12)
Egg <- c(1, 0, 0, 0,  0, 0, 0, 0,  0, 0, 1, 0)
Regular_Milk <- c(0, 0, 1, 0,  0, 0, 0, 0,  0, 0, 0, 0)
Almond_Milk <- c(0, 0, 0, 0,  1, 0, 0, 0,  0, 0, 0, 0)
Coconut_Milk <- c(0, 0, 0, 0,  0, 0, 0, 0,  1, 0, 0, 0)
Regular_Sugar <- c(0, 0, 0, 0,  0, 0, 0, 0,  0, 1, 0, 0)
Cane_Sugar <- c(0, 1, 0, 0,  0, 0, 1, 0,  0, 0, 0, 0)
Regular_Flour <- c(0, 0, 0, 1,  0, 0, 0, 0,  0, 0, 0, 0)
Oat_Flour <- c(0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0)
Wheat_Flour <- c(0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 1)
Almond_Flour <- c(0, 0, 0, 0,  0, 0, 0, 1,  0, 0, 0, 0)

Old_Cake_Data <- data.frame(
  quote_id, Cake, Egg, Regular_Milk, Almond_Milk, Coconut_Milk,
  Regular_Sugar, Cane_Sugar, Regular_Flour, Oat_Flour, Wheat_Flour,
  Almond_Flour
)

# Distinct quote_ids in order of first appearance. unique() keeps the
# column's numeric type, whereas levels(as.factor(...)) would return sorted
# character strings and so change both the order and the type of quote_id.
lev <- unique(Old_Cake_Data$quote_id)

# Start from the first row of every quote_id so that quote_id and Cake are
# already correct; the indicator columns are overwritten with sums below.
New_Cake_Data <- Old_Cake_Data[match(lev, Old_Cake_Data$quote_id), ]
rownames(New_Cake_Data) <- NULL

# Every column except the id and the cake name is a 0/1 indicator.
ingredient_cols <- setdiff(names(Old_Cake_Data), c("quote_id", "Cake"))

# Sum each indicator within its quote_id. reorder = FALSE returns the groups
# in the order they are first encountered, i.e. the same order as `lev`.
New_Cake_Data[ingredient_cols] <- rowsum(
  Old_Cake_Data[ingredient_cols],
  group = Old_Cake_Data$quote_id,
  reorder = FALSE
)


### final data
print(New_Cake_Data)

如果您事先不知道列的数量和类型(数值型还是字符型),可以使用下面更通用的写法:

# Generic version: collapse Old_Cake_Data to one row per quote_id without
# naming the other columns. Numeric columns are summed within each quote_id;
# every other column keeps the value from that quote_id's first row.

# Distinct quote_ids in order of first appearance, keeping their original
# type. Computed here so this snippet does not rely on earlier variables.
ids <- unique(Old_Cake_Data$quote_id)

# The first row of each quote_id already carries the right id and the right
# value for every non-numeric column.
first_rows <- match(ids, Old_Cake_Data$quote_id)
New_Cake_Data <- Old_Cake_Data[first_rows, , drop = FALSE]
rownames(New_Cake_Data) <- NULL

# Pick the numeric columns by name so the id column may sit anywhere.
value_cols <- setdiff(names(Old_Cake_Data), "quote_id")
is_num <- vapply(Old_Cake_Data[value_cols], is.numeric, logical(1))
numeric_cols <- value_cols[is_num]

# reorder = FALSE makes rowsum() return groups in first-encountered order,
# which is the same order as `ids`.
if (length(numeric_cols) > 0) {
  New_Cake_Data[numeric_cols] <- rowsum(
    Old_Cake_Data[numeric_cols],
    group = Old_Cake_Data$quote_id,
    reorder = FALSE
  )
}