因子w / 0级的有序逻辑回归(clm和polr)(postreSQL数据)

时间:2020-09-15 15:38:49

标签: postgresql ordinal logits

我正在尝试根据存储在postgresql中的数据(4000万个观测值)估计有序的逻辑回归。 当我从内存中加载数据时,变量的格式配置正确,但是,当同一数据来自PostgreSQL时,格式不一致。因子类型变量的级别为0,但是从内存加载的数据的级别已正确配置。

我想知道如何正确设置因子水平为0的数据,从而能够从内存中获得相同的估计结果。如果查看内存clm的输出,它们与来自postgresql的数据的clm不同。

感谢您的帮助。我已经研究了很多,没有找到解决方法。

这是一个可以说明这种情况的小模型:

# Creating vectors 
age <- c(40, 49, 48, 40, 67, 52, 53)   
wage <- c(103200, 106200, 150200, 10606, 10390, 14070, 10220) 
gender <- c("0", "0", "1","1", "0", "1", "0") 
recidivism <- c("2","4","1","3","2","1","4")
# Creating data frame employees
employees<- data.frame(age, wage, gender, recidivism)  

employees$recidivism<-as.factor(employees$recidivism)
levels(employees$recidivism)
#[1] "1" "2" "3" "4"
str(employees$recidivism)
#Factor w/ 4 levels "1","2","3","4": 2 4 1 3 2 1 4

employees$gender<-as.factor(employees$gender)
levels(employees$gender)
#[1] "0" "1"
str(employees$gender)
#Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1

employees$wage<-as.numeric(employees$wage)
employees$age<-as.numeric(employees$age)

#write.csv2(employees,"employees.csv", row.names = FALSE)

# Ordered logistic regression
library(MASS)
summary(polr(as.factor(recidivism)~wage+gender+age, data=employees, Hess = TRUE))
polr(recidivism~wage+gender+age, data=employees, Hess = TRUE)
# Call:
#   polr(formula = recidivism ~ wage + gender + age, data = employees, 
#        Hess = TRUE)
# 
# Coefficients:
#   wage        gender1          age 
# -0.00003718695 -5.29212245194 -0.20601381165 
# 
# Intercepts:
#   1|2       2|3       3|4 
# -15.94672 -13.54452 -12.70566 
# 
# Residual Deviance: 12.8695 
# AIC: 24.8695 


library(ordinal)
clm(recidivism~wage+gender+age, data=employees)
# formula: recidivism ~ wage + gender + age
# data:    employees
# 
# link  threshold nobs logLik AIC   niter max.grad cond.H 
# logit flexible  7    -6.43  24.87 6(0)  5.00e-12 1.2e+12
# 
# Coefficients:
#   wage    gender1      age 
# -3.718e-05 -5.291e+00 -2.060e-01 
# 
# Threshold coefficients:
#   1|2    2|3    3|4 
# -15.94 -13.54 -12.70 
## Data from postgresql
if (!require('RPostgreSQL')) install.packages('RPostgreSQL', dependencies = TRUE); library('RPostgreSQL')
if (!require('getPass')) install.packages('getPass',dependencies = TRUE); library('getPass')
if (!require('dplyr')) install.packages('dplyr', dependencies = TRUE); library('dplyr')


# Conexao postgresql ----
# Configurando conexão com o PostgreSQL
con <- dbConnect(RPostgres::Postgres(),dbname = 'tese', 
                 host = 'localhost', 
                 port = 5432, 
                 user = 'postgres',
                 password =  rstudioapi::askForPassword("Enter your password"))

#Listando tabelas disponiveis no PostgreSQL
dbListTables(con) 

#Listando colunas da tabela escolhida no PostgreSQL
dbListFields(con, "employees")

# Carregando dados do PostgreSQL ----

#Carregando tabela
employeesSQL=tbl(con, "employees") # pelo dbplyr (carrega como tibble, em lista)

employeesSQL$recidivism<-as.factor(employeesSQL$recidivism)
class(employeesSQL$recidivism)
#[1] "factor"
levels(employeesSQL$recidivism)
#character(0)
str(employeesSQL$recidivism)
#Factor w/ 0 levels:

employeesSQL$gender<-as.factor(employeesSQL$gender)
class(employeesSQL$gender)
#[1] "factor"
levels(employeesSQL$gender)
#character(0)
str(employeesSQL$gender)
#Factor w/ 0 levels: 

employeesSQL$wage<-as.numeric(employeesSQL$wage)
employeesSQL$age<-as.numeric(employeesSQL$age)


#Ordered logistic regression PostgreSQL
library(MASS)
polr(as.factor(recidivism)~wage+as.factor(gender)+age, data=employeesSQL, Hess = TRUE)
# Warning message:
#   In polr(as.factor(recidivism) ~ wage + gender + age, data = employeesSQL,  :
#             design appears to be rank-deficient, so dropping some coefs

polr(as.factor(employeesSQL$recidivism)~employeesSQL$wage+employeesSQL$gender+employeesSQL$age, Hess = TRUE)
# Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) : 
#   contrasts can be applied only to factors with 2 or more levels

library(ordinal)
clm(as.factor(recidivism)~wage+gender+age, data=employeesSQL)
# formula: as.factor(recidivism) ~ wage + gender + age
# data:    employeesSQL
# 
# link  threshold nobs logLik AIC   niter max.grad cond.H 
# logit flexible  7    -0.00  18.00 19(0) 4.56e-09 4.5e+02
# 
# Coefficients: (6 not defined because of singularities)
# wage103200  wage10390  wage10606 wage106200  wage14070 wage150200     gender"1" 
# -8.129e+01    -8.129e+01    -4.177e+01    -7.273e-15    -1.231e+02    -1.231e+02            NA 
# age48       age49       age52       age53       age67 
# NA            NA            NA            NA            NA 
# 
# Threshold coefficients:
#   "1"|"2" "2"|"3" "3"|"4" 
# -101.07  -61.53  -21.99 

sessionInfo()

R version 4.0.2 (2020-06-22)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19041)

Matrix products: default

locale:
[1] LC_COLLATE=Portuguese_Brazil.1252  LC_CTYPE=Portuguese_Brazil.1252    LC_MONETARY=Portuguese_Brazil.1252
[4] LC_NUMERIC=C                       LC_TIME=Portuguese_Brazil.1252    

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.0.2        getPass_0.2-2      RPostgreSQL_0.6-2  DBI_1.1.0          ordinal_2019.12-10
[6] MASS_7.3-52        RPostgres_1.2.0   

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.5          compiler_4.0.2      pillar_1.4.6        dbplyr_1.4.4        plyr_1.8.6         
 [6] tools_4.0.2         bit_4.0.4           lifecycle_0.2.0     tibble_3.0.3        lattice_0.20-41    
[11] ucminf_1.1-4        pkgconfig_2.0.3     rlang_0.4.7         Matrix_1.2-18       cli_2.0.2          
[16] rstudioapi_0.11     xfun_0.16           knitr_1.29          pROC_1.16.2         generics_0.0.2     
[21] vctrs_0.3.3         hms_0.5.3           bit64_4.0.5         grid_4.0.2          tidyselect_1.1.0   
[26] glue_1.4.1          R6_2.4.1            fansi_0.4.1         purrr_0.3.4.9000    blob_1.2.1         
[31] magrittr_1.5        ellipsis_0.3.1      assertthat_0.2.1    numDeriv_2016.8-1.1 utf8_1.1.4         
[36] crayon_1.3.4       

0 个答案:

没有答案