让我们考虑一下这个数据集:
# Toy data set: 20 simulated patients with demographic covariates, one factor
# lab result (`fac`) and a binary `outcome`.
# `replace = TRUE` is spelled out: `rep = T` relied on partial argument
# matching and on `T`, which is an ordinary (reassignable) variable.
df <- data.frame(
  age = sample(c(20:90), 20, replace = TRUE),
  sex = sample(c("m", "f"), 20, replace = TRUE),
  smoker = sample(c("never", "former", "active"), 20, replace = TRUE),
  size = sample(c(8:40), 20, replace = TRUE),
  fac = as.factor(sample(c("neg", "lo", "med", "hi"), 20, replace = TRUE)),
  outcome = sample(c(0, 1), 20, replace = TRUE)
)
# let's introduce some missing data
# Three random cells in columns 1-6 are blanked; the outcome column (7) is
# never touched, and the same cell may be hit twice.
for (i in seq_len(3)) {
  df[sample(seq_len(20), 1), sample(seq_len(6), 1)] <- NA
}
在医学论文中,第一个表格(“表1”)通常汇总研究人群(或其相应亚组)的特征;这里的行是年龄、性别、吸烟状况等变量,两种结局(outcome 为 0 和 1)分别列在单独的列中。连续变量以平均值报告;分类变量以计数报告。
问太多了?!
答案 0 :(得分:0)
在医学文章中,“表1”总结了研究人群的人口统计学特征,并且通常按亚组分别列出。
# Simulated study population of `n` patients: demographic covariates, a
# logical test result, a factor lab result and a binary outcome.
n <- 100
df <- data.frame(
  age = sample(c(20:90), n, replace = TRUE),
  # was sampled with a hard-coded 20 and silently recycled to n rows
  sex = sample(c("m", "f"), n, replace = TRUE, prob = c(0.55, 0.45)),
  smoker = sample(
    c("never", "former", "active"), n,
    replace = TRUE, prob = c(0.4, 0.45, 0.15)
  ),
  size = abs(rnorm(n, 20, 8)),
  logitest = sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(0.1, 0.9)),
  labtest = as.factor(sample(
    c("neg", "lo", "med", "hi"), n,
    replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1)
  )),
  outcome = sample(c(0, 1), n, replace = TRUE, prob = c(0.8, 0.2)),
  # The later is.factor() classification expects sex/smoker to be factors;
  # since R 4.0 data.frame() no longer converts strings by default.
  stringsAsFactors = TRUE
)

# let's introduce some missing data
# floor(n / 6) random cells (any column, including the outcome) are set to NA;
# seq_len() keeps the loop from running when n < 6.
for (i in seq_len(floor(n / 6))) {
  df[sample(seq_len(n), 1), sample(seq_len(ncol(df)), 1)] <- NA
}
head(df)
## age sex smoker size logitest labtest outcome
## 1 70 m former 39.17 NA med NA
## 2 51 f former 33.64 FALSE hi 1
## 3 58 f former 10.10 FALSE neg 1
## 4 30 m former 43.24 FALSE med 0
## 5 54 m former 22.78 FALSE lo 0
## 6 86 f former 8.20 FALSE neg 0
如果您有真实数据集,请改为读入该数据集(替换上面的模拟数据):
# Template for loading a real data set instead of the simulated one:
# df <- read.csv()
# you may need to eliminate some columns
# colnames(df)
# df0 <- df  # backup
# df <- df[, -c(1, ..., 27:38)]

# Column index of the outcome/diagnosis variable in `df`.
dx <- 7
####################################
# Summarise every covariate (all columns except the outcome). This call was
# previously part of the trailing comment and never executed.
summary(df[, -dx])
## age sex smoker size logitest
## Min. :20.0 f :44 active:19 Min. : 0.91 Mode :logical
## 1st Qu.:42.5 m :54 former:49 1st Qu.:15.00 FALSE:85
## Median :58.0 NA's: 2 never :30 Median :20.12 TRUE :12
## Mean :57.3 NA's : 2 Mean :20.44 NA's :3
## 3rd Qu.:74.0 3rd Qu.:27.10
## Max. :88.0 Max. :43.24
## NA's :1 NA's :2
## labtest
## hi : 4
## lo :29
## med :20
## neg :45
## NA's: 2
##
##
# NOTE: attach(df) was dropped. Every later step indexes `df` explicitly, and
# an attached copy would not follow later modifications of `df`.
# Names of all columns in the data frame (covariates and outcome).
vars <- colnames(df)
vars
## [1] "age" "sex" "smoker" "size" "logitest" "labtest"
## [7] "outcome"
# Accumulators for the variable names, grouped by type. Each statement is on
# its own line: previously everything after the first `#` was a comment, so
# only `catvars` was ever initialised.
catvars <- NULL # categorical variables
contvars <- NULL # continuous variables
logivars <- NULL # logic variables
# Drop the outcome column so that `vars` lists covariates only.
vars <- vars[-dx]
vars
## [1] "age" "sex" "smoker" "size" "logitest" "labtest"
# Classify each covariate by the type of its column in `df`:
#   factor  -> catvars, logical -> logivars, anything else -> contvars.
# Columns are looked up by name, so the result stays correct even when the
# outcome column is not the last one (df[, i] assumed positions line up).
is_cat <- vapply(vars, function(v) is.factor(df[[v]]), logical(1))
is_logi <- vapply(vars, function(v) is.logical(df[[v]]), logical(1))
catvars <- vars[is_cat]
logivars <- vars[is_logi & !is_cat]
contvars <- vars[!is_cat & !is_logi]
contvars
## [1] "age" "size"
catvars
## [1] "sex" "smoker" "labtest"
logivars
## [1] "logitest"
# Subgroup with outcome == 0; rows with a missing outcome are excluded.
bg <- df[df[, dx] == 0 & !is.na(df[, dx]), ]
nrow(bg) # ; bg
## [1] 73
# Subgroup with outcome == 1; rows with a missing outcome are excluded.
mg <- df[df[, dx] == 1 & !is.na(df[, dx]), ]
nrow(mg) # ; mg
## [1] 23
# Patients whose outcome is missing (indeterminate); they belong to neither
# bg nor mg.
indet <- df[is.na(df[, dx]), ]
nrow(indet)
## [1] 4
indet
## age sex smoker size logitest labtest outcome
## 1 70 m former 39.173 NA med NA
## 9 87 m former 23.621 FALSE lo NA
## 18 65 m former 2.466 FALSE <NA> NA
## 67 88 f former 17.575 FALSE med NA
# Shapiro-Wilk normality test for every continuous variable, run on all
# patients together (bg and mg alike). Yields one p-value per entry of
# `contvars`, in the same order.
# Columns are selected by name rather than by position in `vars`, which only
# matches the column position in `df` when the outcome is the last column.
normality <- vapply(
  contvars,
  function(v) shapiro.test(df[[v]])$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
normality
## [1] 0.00125 0.73602
比较两个样本的平均值;如果数据服从正态分布,使用 t 检验,否则使用 Wilcoxon 检验
# Compare each continuous variable between the two outcome subgroups.
# If the Shapiro-Wilk p-value is < .05 the population is likely NOT normally
# distributed, so the Wilcoxon rank-sum test is used; otherwise the t-test.
# A resulting p-value < .05 suggests the subgroups differ.
# Plain if/else replaces the scalar ifelse() with assignments inside it, which
# silently reused the previous test result when normality[i] was NA.
ttpvalue <- vapply(
  seq_along(contvars),
  function(i) {
    v <- contvars[i]
    tt <- if (normality[i] < 0.05) {
      wilcox.test(bg[[v]], mg[[v]])
    } else {
      t.test(bg[[v]], mg[[v]])
    }
    tt$p.value
  },
  numeric(1)
)
ttpvalue
## [1] 0.6358 0.3673
# Bundle the continuous-variable results: names, Shapiro-Wilk p-values and
# the between-subgroup comparison p-values.
contvarlist <- list(
  variables = contvars,
  normality = normality,
  ttest.by.subgroup = ttpvalue
)
# Chi-squared test of independence between each categorical variable and the
# outcome. summary() of a two-way table reports the test, and its p-value is
# collected per entry of `catvars`.
# Columns are selected by name rather than by position in `vars`.
chisqpvalue <- vapply(
  catvars,
  function(v) summary(table(df[[v]], df[[dx]]))$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
chisqpvalue
## [1] 0.01579 0.77116 0.39484
# Bundle the categorical-variable results: names and chi-squared p-values.
catvarlist <- list(
  variables = catvars,
  chisq.by.subgroup = chisqpvalue
)
# Chi-squared test of independence between each logical variable and the
# outcome, using the same summary(table) approach as for the categorical
# variables. One p-value per entry of `logivars`.
# Columns are selected by name rather than by position in `vars`.
proppvalue <- vapply(
  logivars,
  function(v) summary(table(df[[v]], df[[dx]]))$p.value,
  numeric(1),
  USE.NAMES = FALSE
)
proppvalue
## [1] 0.5551
# Bundle the logical-variable results: names and chi-squared p-values.
# Uses `<-` for assignment, consistent with the rest of the script.
logivarlist <- list(
  variables = logivars,
  chisq.by.subgroup = proppvalue
)
# Reading the result: a Shapiro p < .05 means the population is likely NOT
# normally distributed; a t-test p < .05 means the populations likely have
# different means.
str(contvarlist)
## List of 3
## $ variables : chr [1:2] "age" "size"
## $ normality : num [1:2] 0.00125 0.73602
## $ ttest.by.subgroup: num [1:2] 0.636 0.367
# Reading the result: a chi-squared p < .05 means the variable and the
# outcome are likely NOT independent.
str(catvarlist)
## List of 2
## $ variables : chr [1:3] "sex" "smoker" "labtest"
## $ chisq.by.subgroup: num [1:3] 0.0158 0.7712 0.3948
# Reading the result: a chi-squared p < .05 means the variable and the
# outcome are likely NOT independent.
str(logivarlist)
## List of 2
## $ variables : chr "logitest"
## $ chisq.by.subgroup: num 0.555