xj[i] 中的错误:创建线性回归模型时出现“下标类型 'list' 无效”(invalid subscript type 'list')

时间:2018-07-25 20:52:39

标签: r

我正在数据集“movies_merged”上创建线性回归模型。运行以下代码时,出现错误:“Error in xj[i] : invalid subscript type 'list'”(下标类型 'list' 无效)。

最后一行抛出错误 df_m1 = train_model_helper(df_numeric,train_rand)

但在该函数内部,错误恰好发生在这一行:train_data_sample <- dataset[datarand,]

最近2天,我一直在这个问题上停留。请帮助我解决此问题。

Press Start. Warning:

1 个答案:

答案 0 :(得分:0)

1)源代码位于此处:Project 2: Modeling and Evaluation

2)在代码传输期间,数据清理部分可能会产生一些错字(请检查)。

3)上面提到的参考资料中的以下代码不会出现您提到的错误(movies_merged 数据集是从原文链接处下载的):

# Work on a copy of the raw merged movie table, report its shape and list
# its column names.
df <- movies_merged
cat("Dataset has", nrow(df), "rows and", ncol(df), "columns", "\n")
colnames(df)

#install.packages("NLP",dependencies = TRUE)
#install.packages("robustHD",dependencies = TRUE)
#install.packages("lubridate",dependencies = TRUE)
#install.packages("http://cran.r-project.org/bin/windows/contrib/3.0/tm_0.5-10.zip",repos=NULL)
#install.packages("SnowballC")
#install.packages("tm")
#install.packages("proto")
#install.packages("GGally")
#install.packages("gsubfn")
#install.packages("reshape")
#install.packages("plyr")

library(ggplot2)
library(stringr)
library(robustHD)
library(lubridate)
library(NLP)
library(SnowballC)
library(tm)
library(proto)
library(GGally)
library(gsubfn)
library(reshape)
library(plyr)


# ---- Row and column filtering ----
# Comparison-based filters below go through which() / %in% so that an NA in
# the tested column drops the row. Indexing a data frame with a logical
# vector that contains NA inserts an all-NA row instead, which would inflate
# every row count reported here.

# Keep only rows whose Type is "movie".
df2 <- df[which(df$Type == "movie"), ]
dim(df2)
cat(
  "Dataset with non-movie rows removed has", nrow(df2),
  "rows and", ncol(df2), "columns", "\n"
)

# TODO: Remove rows with missing Gross value
df2 <- df2[!is.na(df2$Gross), ]
dim(df2)
cat(
  "Dataset after removing rows with missing Gross value has", nrow(df2),
  "rows and", ncol(df2), "columns", "\n"
)

# TODO: Exclude movies released prior to 2000
df2 <- df2[which(df2$Year >= 2000), ]
dim(df2)
cat(
  "Dataset after excluding movies released prior to 2000 has", nrow(df2),
  "rows and", ncol(df2), "columns", "\n"
)

# TODO: Remove mismatched rows
# What if a movie was released in October to December of 2000 but had year
# listed as 2001? The mismatch removal is relaxed to have more data: a row is
# kept when the release year equals Year or Year + 1.
df_eliminate_rows <- df2
df_eliminate_rows$Released <- as.Date(df_eliminate_rows$Released, "%Y-%m-%d")
df_eliminate_rows$ReleasedYear <-
  as.numeric(format(df_eliminate_rows$Released, "%Y"))
year_gap <- df_eliminate_rows$ReleasedYear - df_eliminate_rows$Year
# %in% never yields NA, so rows with an unparseable release date are dropped
# rather than turned into all-NA rows.
match_df <- df_eliminate_rows[year_gap %in% c(0, 1), ]
df2 <- subset(match_df, select = -ReleasedYear)
cat(
  "Dataset after removing mismatched rows has", nrow(df2),
  "rows and", ncol(df2), "columns", "\n"
)

# TODO: Exclude the `Domestic_Gross` column
df2 <- subset(df2, select = -Domestic_Gross)
cat(
  "Dataset after removing the 'Domestic' column has", nrow(df2),
  "rows and", ncol(df2), "columns", "\n"
)

# TODO: Replace df$Runtime with a numeric column containing the runtime in minutes
# Recognised text forms: "<m> min", "<h> h" and "<h> h <m> min". The original
# element-wise loop read a non-existent third token for the "<h> h" form and
# therefore produced NA instead of h * 60; the vectorised parse below handles
# all three forms and also works on a zero-row data frame.
runtime_text <- trimws(as.character(df2$Runtime))

# Values that are already plain numbers pass through unchanged; anything
# unparseable (e.g. "N/A") becomes NA here and is imputed further down.
runtime_minutes <- suppressWarnings(as.numeric(runtime_text))

only_min <- grepl("^[0-9]+\\s+min$", runtime_text)
only_h <- grepl("^[0-9]+\\s+h$", runtime_text)
h_and_min <- grepl("^[0-9]+\\s+h\\s+[0-9]+\\s+min$", runtime_text)

runtime_minutes[only_min] <-
  as.numeric(sub("\\s+min$", "", runtime_text[only_min]))
runtime_minutes[only_h] <-
  60 * as.numeric(sub("\\s+h$", "", runtime_text[only_h]))
runtime_minutes[h_and_min] <-
  60 * as.numeric(sub("^([0-9]+)\\s.*$", "\\1", runtime_text[h_and_min])) +
  as.numeric(sub("^.*\\s([0-9]+)\\s+min$", "\\1", runtime_text[h_and_min]))

df2$Runtime <- runtime_minutes

# Replace NA values with the median
df2$Runtime[is.na(df2$Runtime)] <- median(df2$Runtime, na.rm = TRUE)


# TODO: Print the dimensions of the final preprocessed dataset and column names
cat(
  "Final preprocessed dataset has", nrow(df2),
  "rows and", ncol(df2), "columns", "\n"
)
cat(
  "Column Names in the final preprocessed dataset are shown below\n",
  colnames(df2), "\n"
)

# TODO: Build & evaluate model 1 (numeric variables only)

# Zero-initialised 5 x 3 table intended to hold the minimum test and
# training RMSE.
rmse_df <- as.data.frame(matrix(0, ncol = 3, nrow = 5))
x <- c("Task", "test_RMSE", "train_RMSE")
colnames(rmse_df) <- x

# Ensure that all values in features that will be used in the linear
# regression model are numeric.
df2$Metascore <- as.numeric(as.character(df2$Metascore))
df2_ommitted_na <- na.omit(df2)

# vapply() always returns a logical vector, unlike sapply(), whose return
# type depends on its input.
num_indic <- which(vapply(df2_ommitted_na, is.numeric, logical(1)))

# Data frame with all numeric values. drop = FALSE keeps a data frame even
# if only a single numeric column remains.
df_numeric <- df2_ommitted_na[, num_indic, drop = FALSE]

df_numeric <- subset(df_numeric, select = -tomatoRotten)
set.seed(101)

# Setting aside 80% of the dataset for training.
train_rand <- sample(
  seq_len(nrow(df_numeric)),
  size = floor(0.8 * nrow(df_numeric))
)

#' Average training and test RMSE of a linear model over repeated subsamples
#'
#' Fits `lm(Gross ~ .)` ten times, each time on a random subset of
#' `df_train` holding `floor(ratio * nrow(df_train))` rows, and scores every
#' fit on its own training subset and on `df_test`.
#'
#' @param df_train Data frame with a `Gross` column plus predictor columns.
#' @param df_test Data frame with the same columns; used only for scoring.
#' @param ratio Fraction of `df_train` rows drawn (without replacement) for
#'   each fit.
#' @return Numeric vector of length 2: mean training RMSE, mean test RMSE.
rmseCalc <- function(df_train, df_test, ratio) {
  n_repeats <- 10L
  sample_size <- floor(ratio * nrow(df_train))
  train_rmse <- numeric(n_repeats)
  test_rmse <- numeric(n_repeats)

  # Seed once, before the loop: reseeding inside the loop made every
  # repetition draw the identical subset, so the "average" was taken over ten
  # copies of the same fit.
  set.seed(125)

  for (i in seq_len(n_repeats)) {
    # Partial training data based on the ratio.
    picked <- sample.int(nrow(df_train), size = sample_size)
    temp_df <- df_train[picked, , drop = FALSE]

    theta_mle <- lm(Gross ~ ., data = temp_df)

    # Keep predictions as a plain numeric vector: mean() of a data frame
    # returns NA, which previously made every test RMSE NA.
    predicted <- as.numeric(predict(theta_mle, newdata = df_test))

    # RMSE = sqrt(mean(squared error)); the mean() was missing for the
    # training residuals, which yielded absolute residuals instead.
    train_rmse[i] <- sqrt(mean(residuals(theta_mle)^2))
    test_rmse[i] <- sqrt(mean((df_test$Gross - predicted)^2))
  }

  c(mean(train_rmse), mean(test_rmse))
}

#' RMSE table for the linear model across training-set fractions
#'
#' Splits `dataset` into training rows (`datarand`) and test rows (all other
#' rows), then calls `rmseCalc()` for each training fraction from 0.1 to 1
#' and prints each result as it is produced.
#'
#' @param dataset Data frame passed on to `rmseCalc()` after splitting.
#' @param datarand Positive row positions of `dataset` forming the training
#'   split. A list (including a one-column data frame) of positions is
#'   flattened first.
#' @return Data frame with columns `sampling_list`, `train_rmse`, `test_rmse`,
#'   one row per training fraction.
train_model_helper <- function(dataset, datarand) {
  # Literal values (not seq()) so the fractions are exactly those typed here.
  sampling_list <- c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)

  # Indexing a data frame with a list fails with
  # "invalid subscript type 'list'"; flatten such input to a plain vector.
  if (is.list(datarand)) {
    datarand <- unlist(datarand, use.names = FALSE)
  }

  # setdiff() instead of -datarand: a negative index built from an empty
  # vector would select zero rows rather than all of them.
  test_rows <- setdiff(seq_len(nrow(dataset)), datarand)
  train_data_sample <- dataset[datarand, , drop = FALSE]
  test_data_sample <- dataset[test_rows, , drop = FALSE]

  train_rmse <- numeric(length(sampling_list))
  test_rmse <- numeric(length(sampling_list))
  for (k in seq_along(sampling_list)) {
    retval_rmse <- rmseCalc(
      train_data_sample, test_data_sample, sampling_list[k]
    )
    print(retval_rmse)
    train_rmse[k] <- retval_rmse[1]
    test_rmse[k] <- retval_rmse[2]
  }

  df_rmse_m1 <- data.frame(sampling_list, train_rmse, test_rmse)
  df_rmse_m1
}




# Build the RMSE table for model 1 from the numeric-only data and the
# training row sample.
df_m1 <- train_model_helper(dataset = df_numeric, datarand = train_rand)