考虑以下代码:
# Load libraries
library(RCurl)     # getURL()
library(TraMineR)  # seqdef()
library(PST)       # pstree(), cmine()

# Get data: download the CSV text from the gist and parse it into a data frame.
x <- getURL("https://gist.githubusercontent.com/aronlindberg/08228977353bf6dc2edb3ec121f54a29/raw/c2539d06771317c5f4c8d3a2052a73fc485a09c6/challenge_level.csv")
data <- read.csv(text = x)

# Load and transform data
# NOTE(review): this assignment replaces the data frame downloaded above with
# the contents of a local file, so the download has no effect on what follows.
# Confirm which of the two sources is the intended one.
data <- read.table(
  "thread_level.csv",
  sep = ",",
  header = FALSE,
  stringsAsFactors = FALSE
)

# Create sequence object
# The file is read with header = FALSE, and the first row and first column are
# excluded here (presumably a header row and an id column -- confirm).
data.seq <- seqdef(
  data[2:nrow(data), 2:ncol(data)],
  missing = NA,
  right = NA,
  nr = "*"
)

# Make a tree
S1 <- pstree(data.seq, ymin = 0.05, L = 6, lik = FALSE, with.missing = TRUE)

# Look at contexts
cmine(S1, pmin = 0, state = "N3", l = 3)
然后，我可以按以下方式计算两条特定“关联规则”的提升度（lift）的显著性阈值：
# Lift significance thresholds for the rules N2-QU -> N3 and N2-QU -> N1.
# Both thresholds use the same expression:
#   1 + 1.645 * sqrt((1 / nn) * (p_not_context_target / p_context_target))
# NOTE(review): 1.645 looks like the one-sided 5% standard-normal critical
# value -- confirm the intended significance level.

# Number of cells in the sequence object (rows x columns); shared by both rules.
nn <- nrow(data.seq) * ncol(data.seq)

# Probability of the context N2-QU as predicted by the tree S1.
# Note: `x` reuses the name that earlier held the downloaded CSV text.
x <- seqdef("N2-QU")
p_context <- predict(S1, x, decomp = FALSE, output = "prob")

# Calculate lift threshold for N2-QU->N3
ngood_idea <- sum(data.seq == "N3")
p_good_idea <- ngood_idea / nn
p_not_context_good_idea <- (1 - p_context) * (1 - p_good_idea)
p_context_good_idea <- p_context * p_good_idea
N2_QU_N3_threshold <- 1 + 1.645 * sqrt(
  (1 / nn) * (p_not_context_good_idea / p_context_good_idea)
)

# Calculate lift threshold for N2-QU->N1
nbad_idea <- sum(data.seq == "N1")
p_bad_idea <- nbad_idea / nn
p_not_context_bad_idea <- (1 - p_context) * (1 - p_bad_idea)
p_context_bad_idea <- p_context * p_bad_idea
N2_QU_N1_threshold <- 1 + 1.645 * sqrt(
  (1 / nn) * (p_not_context_bad_idea / p_context_bad_idea)
)

# Print lift thresholds
N2_QU_N3_threshold
N2_QU_N1_threshold
然而，如果我想比较两个提升度（lift）值，检验它们彼此之间是否存在显著差异（类似于把两个回归系数相互比较，以检验它们是否显著不同），该怎么办？我该如何做到这一点？
答案 0 :(得分:1)
利用这个等式:
$Z = \frac{\beta_1-\beta_2}{\sqrt{(SE\beta_1)^2+(SE\beta_2)^2}}$
其中 $SE\beta$ 是 $\beta$ 的标准误。
该等式由Clogg等人(1995)提供
我们可以类推：把提升度（lift）当作系数，并根据 Lenca 等人（2008，第 619 页）计算每个提升度的方差：
# Z statistic for the difference between two lifts that share the context
# N2-QU: z_diff = (good_lift - bad_lift) / sqrt(var_good_idea + var_bad_idea).
# Relies on S1, data.seq and x (the sequence object built with
# seqdef("N2-QU")) already existing in the workspace.
# NOTE(review): the target states here are "I3"/"I1", whereas the threshold
# code earlier in this file uses "N3"/"N1" -- confirm which labels are meant.

# Query the tree once for the N2-QU context and reuse the result, instead of
# repeating the same query() call for every lookup.
context_probs <- query(S1, context = "N2-QU", output = "prob")@.Data
state_names <- attr(context_probs, "dimnames")[[2]]

# Calculate conditional probability for I3
cp_good <- unlist(context_probs[state_names == "I3"])

# Calculate conditional probability for I1
cp_bad <- unlist(context_probs[state_names == "I1"])

# Number of cells in the sequence object (rows x columns); shared by both lifts.
nn <- nrow(data.seq) * ncol(data.seq)

# Calculate lift for I3
ngood_idea <- sum(data.seq == "I3")
p_good_idea <- ngood_idea / nn
good_lift <- cp_good / p_good_idea

# Calculate lift for I1
nbad_idea <- sum(data.seq == "I1")
p_bad_idea <- nbad_idea / nn
bad_lift <- cp_bad / p_bad_idea

# Calculate z_diff
p_context <- predict(S1, x, decomp = FALSE, output = "prob")
p_not_context_good_idea <- (1 - p_context) * (1 - p_good_idea)
p_context_good_idea <- p_context * p_good_idea
p_not_context_bad_idea <- (1 - p_context) * (1 - p_bad_idea)
p_context_bad_idea <- p_context * p_bad_idea
var_good_idea <- (1 / nn) * (p_not_context_good_idea / p_context_good_idea)
var_bad_idea <- (1 / nn) * (p_not_context_bad_idea / p_context_bad_idea)
z_diff <- (good_lift - bad_lift) / sqrt(var_good_idea + var_bad_idea)
z_diff
差异的 z 值为 0.2556881。
Clogg, C. C., Petkova, E., & Haritou, A. (1995). 比较模型间回归系数的统计方法。《美国社会学杂志》, 100(5), 1261–1293.
Lenca, P., Meyer, P., Vaillant, B., & Lallich, S. (2008). “关于选择关联规则的兴趣度量：用户导向描述和多标准决策辅助”。《欧洲运筹学期刊》, 184(2), 610–626 (doi:10.1016/j.ejor.2006.10.059).