现在，结果！

Question

让我们考虑一下这个数据集：

# Toy data set: 20 patients with five covariates and a binary outcome.
# TRUE and the full argument name `replace` are spelled out: T can be
# reassigned, and `rep =` only worked through partial argument matching.
df <- data.frame(
  age = sample(20:90, 20, replace = TRUE),
  sex = sample(c("m", "f"), 20, replace = TRUE),
  smoker = sample(c("never", "former", "active"), 20, replace = TRUE),
  size = sample(8:40, 20, replace = TRUE),
  fac = as.factor(sample(c("neg", "lo", "med", "hi"), 20, replace = TRUE)),
  outcome = sample(c(0, 1), 20, replace = TRUE)
)
# let's introduce some missing data: three random cells are set to NA
# (the same cell may be drawn twice, so fewer than three NAs are possible)
for (i in seq_len(3)) {
  df[sample(nrow(df), 1), sample(ncol(df), 1)] <- NA
}

在医学论文中，第一个表格（“表1”）用于概括研究人群（或其相应亚组）的特征；表中的行是年龄、性别、吸烟状况等，两种结局分别列在单独的列中。连续变量以平均值报告；分类变量以计数报告。

我想知道是否有我没注意到的现成函数可以生成这样的列联表。我可以手动完成，但希望在数据集发生变化时能够自动更新。最终我需要以 LaTeX 格式输出。
该函数需要忽略缺失的数据，但不能删除这些行。

问太多了？！

Answer 1

在医学文章中，“表1”总结了研究人群的人口统计特征，通常在亚组之间细分

生成数据集

# Simulated study population: n patients, six covariates and a binary outcome.
n <- 100
df <- data.frame(
  age = sample(20:90, n, replace = TRUE),
  # every column is drawn with n values; a shorter vector would be silently
  # recycled by data.frame()
  sex = sample(c("m", "f"), n, replace = TRUE, prob = c(0.55, 0.45)),
  smoker = sample(
    c("never", "former", "active"), n,
    replace = TRUE, prob = c(0.4, 0.45, 0.15)
  ),
  size = abs(rnorm(n, 20, 8)),
  logitest = sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.1, 0.9)),
  labtest = as.factor(sample(
    c("neg", "lo", "med", "hi"), n,
    replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)
  )),
  outcome = sample(c(0, 1), n, replace = TRUE, prob = c(0.8, 0.2)),
  # since R 4.0.0 strings stay character by default; sex and smoker must be
  # factors so that they are treated as categorical variables later on
  stringsAsFactors = TRUE
)

# let's introduce some missing data: roughly n/6 random cells are set to NA
# (seq_len() yields an empty loop when n < 6, whereas 1:0 would run twice)
for (i in seq_len(floor(n / 6))) {
  df[sample(n, 1), sample(ncol(df), 1)] <- NA
}
head(df)

##   age sex smoker  size logitest labtest outcome
## 1  70   m former 39.17       NA     med      NA
## 2  51   f former 33.64    FALSE      hi       1
## 3  58   f former 10.10    FALSE     neg       1
## 4  30   m former 43.24    FALSE     med       0
## 5  54   m former 22.78    FALSE      lo       0
## 6  86   f former  8.20    FALSE     neg       0

如果使用真实数据集，请改用它

# df <- read.csv() 
#you may need to eliminate some columns    
#colnames(df) 
#df0<-df #backup 
#df <- df[,-c(1,...,27:38)]

根据需要更改：必须从变量列表中删除具有诊断的列！

# Column index of the outcome/diagnosis variable in df; change it to match
# your own data set.
dx <- 7
####################################
# Summarise every column of df except the outcome column.
summary(df[, -dx])

##       age         sex        smoker        size        logitest      
##  Min.   :20.0   f   :44   active:19   Min.   : 0.91   Mode :logical  
##  1st Qu.:42.5   m   :54   former:49   1st Qu.:15.00   FALSE:85       
##  Median :58.0   NA's: 2   never :30   Median :20.12   TRUE :12       
##  Mean   :57.3             NA's  : 2   Mean   :20.44   NA's :3        
##  3rd Qu.:74.0                         3rd Qu.:27.10                  
##  Max.   :88.0                         Max.   :43.24                  
##  NA's   :1                            NA's   :2                      
##  labtest  
##  hi  : 4  
##  lo  :29  
##  med :20  
##  neg :45  
##  NA's: 2  
##           
##

# attach(df) removed: every later step indexes df, bg and mg explicitly, so
# attaching would only put a stale copy of the columns on the search path
# (one that can mask other objects and is not updated when df changes).

构建变量列表

# Start from the names of all columns of the data frame and display them.
vars <- names(df)
print(vars)

## [1] "age"      "sex"      "smoker"   "size"     "logitest" "labtest" 
## [7] "outcome"

catvars <- NULL  # categorical variables
contvars <- NULL  # continuous variables
logivars <- NULL  # logical variables

# Every column except the outcome is a candidate variable. Taking the names
# straight from df keeps this step correct even if it is re-run.
vars <- colnames(df)[-dx]
vars

## [1] "age"      "sex"      "smoker"   "size"     "logitest" "labtest"

# Sort each variable into one of the three groups by the type of its column.
# Columns are looked up by NAME: a position in `vars` is not a position in
# `df` unless the outcome happens to be the last column.
for (v in vars) {
  column <- df[[v]]
  if (is.factor(column) || is.character(column)) {
    # character columns are categorical as well (strings are no longer
    # converted to factors automatically since R 4.0.0)
    catvars <- c(catvars, v)
  } else if (is.logical(column)) {
    logivars <- c(logivars, v)
  } else {
    contvars <- c(contvars, v)
  }
}
contvars

## [1] "age"  "size"

catvars

## [1] "sex"     "smoker"  "labtest"

logivars

## [1] "logitest"

创建子组

# Patients with outcome 0; which() keeps only positions where the comparison
# is TRUE, so rows with a missing outcome are left out.
bg <- df[which(df[[dx]] == 0), ]
nrow(bg)  #; bg

## [1] 73

# Patients with outcome 1, again without the rows whose outcome is missing.
mg <- df[which(df[[dx]] == 1), ]
nrow(mg)  #; mg

## [1] 23

# Patients whose outcome is missing (indeterminate).
indet <- df[which(is.na(df[[dx]])), ]
nrow(indet)

## [1] 4

indet

##    age sex smoker   size logitest labtest outcome
## 1   70   m former 39.173       NA     med      NA
## 9   87   m former 23.621    FALSE      lo      NA
## 18  65   m former  2.466    FALSE    <NA>      NA
## 67  88   f former 17.575    FALSE     med      NA

对于连续变量

正态性检验

# Shapiro-Wilk normality test for every continuous variable, run on all
# patients (bg and mg alike). One p-value per entry of contvars.
# Columns are addressed by name: the position of a variable in `vars` is not
# its position in `df` unless the outcome column comes last.
normality <- vapply(
  contvars,
  function(v) shapiro.test(df[[v]])$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
normality

## [1] 0.00125 0.73602

比较两个样本的均值；如果数据服从正态分布，使用 t 检验，否则使用 Wilcoxon 检验

# Compare each continuous variable between the two outcome groups (bg vs mg).
# If the Shapiro-Wilk p-value is below .05 the variable is likely NOT normally
# distributed, so the Wilcoxon test is used; otherwise the t-test.
# A p-value below .05 suggests that the two groups differ.
# Columns are addressed by name so that the right column is tested wherever
# the outcome column sits in the data frame.
ttpvalue <- vapply(
  seq_along(contvars),
  function(i) {
    v <- contvars[i]
    two_sample_test <- if (normality[i] < 0.05) wilcox.test else t.test
    two_sample_test(bg[[v]], mg[[v]])$p.value
  },
  numeric(1)
)
ttpvalue

## [1] 0.6358 0.3673

contvarlist <- list(
  variables = contvars,
  normality = normality,
  ttest.by.subgroup = ttpvalue
)

对于分类变量

# Chi-squared test of independence between every categorical variable and the
# outcome; summary() of a contingency table reports that test's p-value.
# Columns are addressed by name so that the right column is tabulated
# wherever the outcome column sits in the data frame.
chisqpvalue <- vapply(
  catvars,
  function(v) summary(table(df[[v]], df[[dx]]))$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
chisqpvalue

## [1] 0.01579 0.77116 0.39484

catvarlist <- list(variables = catvars, chisq.by.subgroup = chisqpvalue)

对于逻辑变量

# Chi-squared test of independence between every logical variable and the
# outcome; summary() of a contingency table reports that test's p-value.
# Columns are addressed by name, and vapply() simply returns an empty vector
# when the data set contains no logical variables.
proppvalue <- vapply(
  logivars,
  function(v) summary(table(df[[v]], df[[dx]]))$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
proppvalue

## [1] 0.5551

logivarlist <- list(variables = logivars, chisq.by.subgroup = proppvalue)

现在，结果！

# Continuous variables. Reading the p-values:
#   normality (Shapiro-Wilk) < .05  -> likely NOT normally distributed
#   ttest.by.subgroup < .05         -> the two outcome groups likely differ
str(contvarlist)

## List of 3
##  $ variables        : chr [1:2] "age" "size"
##  $ normality        : num [1:2] 0.00125 0.73602
##  $ ttest.by.subgroup: num [1:2] 0.636 0.367

# Categorical variables: a chi-squared p-value < .05 means the variable and
# the outcome are likely NOT independent.
str(catvarlist)

## List of 2
##  $ variables        : chr [1:3] "sex" "smoker" "labtest"
##  $ chisq.by.subgroup: num [1:3] 0.0158 0.7712 0.3948

# Logical variables: a chi-squared p-value < .05 means the variable and the
# outcome are likely NOT independent.
str(logivarlist)

## List of 2
##  $ variables        : chr "logitest"
##  $ chisq.by.subgroup: num 0.555

R中的多重列联表

1 个答案:

生成数据集

根据需要更改：必须从变量列表中删除具有诊断的列！

构建变量列表

创建子组

对于连续变量

对于分类变量

对于逻辑变量

现在，结果！