我正在尝试根据存储在postgresql中的数据(4000万个观测值)估计有序的逻辑回归。 当我从内存中加载数据时,变量的格式配置正确,但是,当同一数据来自PostgreSQL时,格式不一致。因子类型变量的级别为0,但是从内存加载的数据的级别已正确配置。
我想知道如何正确设置因子水平为0的数据,从而能够从内存中获得相同的估计结果。如果查看内存clm的输出,它们与来自postgresql的数据的clm不同。
感谢您的帮助。我已经研究了很多,没有找到解决方法。
这是一个可以说明这种情况的小模型:
# Creating vectors
age <- c(40, 49, 48, 40, 67, 52, 53)
wage <- c(103200, 106200, 150200, 10606, 10390, 14070, 10220)
gender <- c("0", "0", "1","1", "0", "1", "0")
recidivism <- c("2","4","1","3","2","1","4")
# Creating data frame employees
employees<- data.frame(age, wage, gender, recidivism)
employees$recidivism<-as.factor(employees$recidivism)
levels(employees$recidivism)
#[1] "1" "2" "3" "4"
str(employees$recidivism)
#Factor w/ 4 levels "1","2","3","4": 2 4 1 3 2 1 4
employees$gender<-as.factor(employees$gender)
levels(employees$gender)
#[1] "0" "1"
str(employees$gender)
#Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1
employees$wage<-as.numeric(employees$wage)
employees$age<-as.numeric(employees$age)
#write.csv2(employees,"employees.csv", row.names = FALSE)
# Ordered logistic regression
library(MASS)
summary(polr(as.factor(recidivism)~wage+gender+age, data=employees, Hess = TRUE))
polr(recidivism~wage+gender+age, data=employees, Hess = TRUE)
# Call:
# polr(formula = recidivism ~ wage + gender + age, data = employees,
# Hess = TRUE)
#
# Coefficients:
# wage gender1 age
# -0.00003718695 -5.29212245194 -0.20601381165
#
# Intercepts:
# 1|2 2|3 3|4
# -15.94672 -13.54452 -12.70566
#
# Residual Deviance: 12.8695
# AIC: 24.8695
library(ordinal)
clm(recidivism~wage+gender+age, data=employees)
# formula: recidivism ~ wage + gender + age
# data: employees
#
# link threshold nobs logLik AIC niter max.grad cond.H
# logit flexible 7 -6.43 24.87 6(0) 5.00e-12 1.2e+12
#
# Coefficients:
# wage gender1 age
# -3.718e-05 -5.291e+00 -2.060e-01
#
# Threshold coefficients:
# 1|2 2|3 3|4
# -15.94 -13.54 -12.70
## Data from postgresql
if (!require('RPostgreSQL')) install.packages('RPostgreSQL', dependencies = TRUE); library('RPostgreSQL')
if (!require('getPass')) install.packages('getPass',dependencies = TRUE); library('getPass')
if (!require('dplyr')) install.packages('dplyr', dependencies = TRUE); library('dplyr')
# Conexao postgresql ----
# Configurando conexão com o PostgreSQL
con <- dbConnect(RPostgres::Postgres(),dbname = 'tese',
host = 'localhost',
port = 5432,
user = 'postgres',
password = rstudioapi::askForPassword("Enter your password"))
#Listando tabelas disponiveis no PostgreSQL
dbListTables(con)
#Listando colunas da tabela escolhida no PostgreSQL
dbListFields(con, "employees")
# Carregando dados do PostgreSQL ----
#Carregando tabela
employeesSQL=tbl(con, "employees") # pelo dbplyr (carrega como tibble, em lista)
employeesSQL$recidivism<-as.factor(employeesSQL$recidivism)
class(employeesSQL$recidivism)
#[1] "factor"
levels(employeesSQL$recidivism)
#character(0)
str(employeesSQL$recidivism)
#Factor w/ 0 levels:
employeesSQL$gender<-as.factor(employeesSQL$gender)
class(employeesSQL$gender)
#[1] "factor"
levels(employeesSQL$gender)
#character(0)
str(employeesSQL$gender)
#Factor w/ 0 levels:
employeesSQL$wage<-as.numeric(employeesSQL$wage)
employeesSQL$age<-as.numeric(employeesSQL$age)
#Ordered logistic regression PostgreSQL
library(MASS)
polr(as.factor(recidivism)~wage+as.factor(gender)+age, data=employeesSQL, Hess = TRUE)
# Warning message:
# In polr(as.factor(recidivism) ~ wage + gender + age, data = employeesSQL, :
# design appears to be rank-deficient, so dropping some coefs
polr(as.factor(employeesSQL$recidivism)~employeesSQL$wage+employeesSQL$gender+employeesSQL$age, Hess = TRUE)
# Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
# contrasts can be applied only to factors with 2 or more levels
library(ordinal)
clm(as.factor(recidivism)~wage+gender+age, data=employeesSQL)
# formula: as.factor(recidivism) ~ wage + gender + age
# data: employeesSQL
#
# link threshold nobs logLik AIC niter max.grad cond.H
# logit flexible 7 -0.00 18.00 19(0) 4.56e-09 4.5e+02
#
# Coefficients: (6 not defined because of singularities)
# wage103200 wage10390 wage10606 wage106200 wage14070 wage150200 gender"1"
# -8.129e+01 -8.129e+01 -4.177e+01 -7.273e-15 -1.231e+02 -1.231e+02 NA
# age48 age49 age52 age53 age67
# NA NA NA NA NA
#
# Threshold coefficients:
# "1"|"2" "2"|"3" "3"|"4"
# -101.07 -61.53 -21.99
sessionInfo()
R version 4.0.2 (2020-06-22)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19041)
Matrix products: default
locale:
[1] LC_COLLATE=Portuguese_Brazil.1252 LC_CTYPE=Portuguese_Brazil.1252 LC_MONETARY=Portuguese_Brazil.1252
[4] LC_NUMERIC=C LC_TIME=Portuguese_Brazil.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] dplyr_1.0.2 getPass_0.2-2 RPostgreSQL_0.6-2 DBI_1.1.0 ordinal_2019.12-10
[6] MASS_7.3-52 RPostgres_1.2.0
loaded via a namespace (and not attached):
[1] Rcpp_1.0.5 compiler_4.0.2 pillar_1.4.6 dbplyr_1.4.4 plyr_1.8.6
[6] tools_4.0.2 bit_4.0.4 lifecycle_0.2.0 tibble_3.0.3 lattice_0.20-41
[11] ucminf_1.1-4 pkgconfig_2.0.3 rlang_0.4.7 Matrix_1.2-18 cli_2.0.2
[16] rstudioapi_0.11 xfun_0.16 knitr_1.29 pROC_1.16.2 generics_0.0.2
[21] vctrs_0.3.3 hms_0.5.3 bit64_4.0.5 grid_4.0.2 tidyselect_1.1.0
[26] glue_1.4.1 R6_2.4.1 fansi_0.4.1 purrr_0.3.4.9000 blob_1.2.1
[31] magrittr_1.5 ellipsis_0.3.1 assertthat_0.2.1 numDeriv_2016.8-1.1 utf8_1.1.4
[36] crayon_1.3.4