第 8 章 列联表

8.1 二维列联表的齐性和独立性的\(\chi^2\)检验

8.1.1 齐性

y=matrix(scan("data/wid.txt"),3,2,b=T)
chisq.test(y)
## 
##  Pearson's Chi-squared test
## 
## data:  y
## X-squared = 1.076, df = 2, p-value = 0.5839

8.1.2 独立性

y=matrix(scan("data/shop.txt"),3,3,b=T)
chisq.test(y)
## 
##  Pearson's Chi-squared test
## 
## data:  y
## X-squared = 18.651, df = 4, p-value = 0.0009203
a=loglin(y,list(1,2)) #fit log-linear model
## 2 iterations: deviation 1.421085e-14
pchisq(a$lrt,a$df,low=F)
## [1] 0.000903918

8.1.3 Independence of Two Discrete Random Variables (例子:吸烟与肺癌)

chisq.test(matrix(c(43,13,162,121),2,2),correct=F)
## 
##  Pearson's Chi-squared test
## 
## data:  matrix(c(43, 13, 162, 121), 2, 2)
## X-squared = 7.4688, df = 1, p-value = 0.006278

8.2 8.2 低维列联表的Fisher精确检验

8.2.1 数据为表格形式

x=read.table("data/stroke.txt");
1-phyper(34,60,53,50)
## [1] 0.001176454
phyper(15,53,60,50)
## [1] 0.001176454
chisq.test(x) ##different results
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  x
## X-squared = 9.107, df = 1, p-value = 0.002546
fisher.test(x)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  x
## p-value = 0.002242
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.508528 8.451155
## sample estimates:
## odds ratio 
##   3.504852

8.2.2 数据为数据框格式

x1=read.table("data/strokeA.txt")
attach(x1)
fisher.test(xtabs(V1~.,x1))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  xtabs(V1 ~ ., x1)
## p-value = 0.002242
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.508528 8.451155
## sample estimates:
## odds ratio 
##   3.504852

8.3 8.3 两个比例的比较

8.3.1 例8.3 关于中风的研究

p1=x[1,1]/sum(x[1,]);
p2=x[2,1]/sum(x[2,]);
pdif1=p1-p2;
se1=sqrt(p1*(1-p1)/sum(x[1,])+p2*(1-p2)/sum(x[2,]))
pdifc1=c(p1-p2-1.96*se1,p1-p2+1.96*se1)
rr1=p1/p2;
ser1=sqrt((1-p1)/x[1,1]+(1-p2)/x[2,1]);
rrc1=c(rr1*exp(-1.96*ser1),rr1*exp(1.96*ser1))
or1=(p1/(1-p1))/(p2/(1-p2));seor1=sqrt(sum(1/x))
orc1=c(or1*exp(-1.96*seor1),or1*exp(1.96*seor1))
list(dif=pdif1,difCI=pdifc1,RR=rr1,RRCI=rrc1,OR=or1,ORCI=orc1)
## $dif
## [1] 0.3031746
## 
## $difCI
## [1] 0.1278747 0.4784745
## 
## $RR
## [1] 1.764
## 
## $RRCI
## [1] 1.237586 2.514326
## 
## $OR
## [1] 3.546667
## 
## $ORCI
## [1] 1.613185 7.797522

8.3.2 One-Sample Problems*

#(Squeaky Hip Replacements). 
#As a numerical example, Devore(2012), page 284, 
#reports on a study of 143 subjects who have obtained ceramic hip replacements. 
#Ten of the subjects in the study reported that their hip replacements squeaked. 
#Consider patients who receive such a ceramic hip replacement 
#and let p denote the true proportion of those whose replacement hips develop a squeak.
phat<-10/143
zcv<-qnorm(0.975)
phat+c(-1,1)*zcv*sqrt(phat*(1-phat)/143)
## [1] 0.02813069 0.11172945
#(Left-Handed Professional Ball Players). 
#As an example of this test, 
#consider testing whether the proportion of left-handed professional baseball players is the same as the proportion of left-handed people in the general population, which is about 0.15. 
#For our sample we use the dataset baseball that consists of observations on 59 professional baseball players, including throwing hand (‘L’ or ‘R’).
library(Rfit)
ind<-with(baseball,throw=='L')
prop.test(sum(ind),length(ind),p=0.15,correct=FALSE)
## 
##  1-sample proportions test without continuity correction
## 
## data:  sum(ind) out of length(ind), null probability 0.15
## X-squared = 5.0279, df = 1, p-value = 0.02494
## alternative hypothesis: true p is not equal to 0.15
## 95 percent confidence interval:
##  0.1605598 0.3779614
## sample estimates:
##         p 
## 0.2542373
binom.test(sum(ind),59,p=.15)
## 
##  Exact binomial test
## 
## data:  sum(ind) and 59
## number of successes = 15, number of trials = 59, p-value = 0.04192
## alternative hypothesis: true probability of success is not equal to 0.15
## 95 percent confidence interval:
##  0.1498208 0.3844241
## sample estimates:
## probability of success 
##              0.2542373

8.3.3 Two-Sample Problems*

#(Polio Vaccine). Rasmussen (1992), page 355, 
#discusses one of the original clinical studies for the efficacy of the Salk polio vaccine which took place in 1954. 
#The effectiveness of the vaccine was not known and there were fears that it could even cause polio since the vaccine contained live virus. 
#Children with parental written consent were randomly divided into two groups. #Children in the treatment group (1) were injected with the vaccine while those in the control or placebo group (2) were injected with a biologically inert solution. 
#Let p1 and p2 denote the true proportions of children who get polio in the treatment and control groups, respectively. 
#The hypothesis of interest is the two-sided hypothesis (2.18).
prop.test(c(57,199),c(200745,201229),correct=FALSE)
## 
##  2-sample test for equality of proportions without continuity
##  correction
## 
## data:  c(57, 199) out of c(200745, 201229)
## X-squared = 78.474, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0008608391 -0.0005491224
## sample estimates:
##       prop 1       prop 2 
## 0.0002839423 0.0009889231

8.4 8.4 Cochran-Mantel-Haenszel估计

  • \(2 \times 2 \times K\)列联表
x=read.table("data/hospital.txt");
tmp=array(c(x[,4]),dim=c(2,2,4),dimnames=list(effect=c("Y","N"),
med=c("A","B"),hosptl=c("I", "II","III","IV")));
tab=ftable(. ~ med+effect,tmp);
list(tab,mantelhaen.test(tmp))
## [[1]]
##            hosptl  I II III IV
## med effect                    
## A   Y              8 11   4 19
##     N             21 10   7  7
## B   Y              2  2   1  2
##     N             35 13  22  4
## 
## [[2]]
## 
##  Mantel-Haenszel chi-squared test with continuity correction
## 
## data:  tmp
## Mantel-Haenszel X-squared = 18.674, df = 1, p-value = 1.551e-05
## alternative hypothesis: true common odds ratio is not equal to 1
## 95 percent confidence interval:
##   2.849271 18.094337
## sample estimates:
## common odds ratio 
##          7.180227

8.5 8.5 对数线性模型与高维列联表的独立性

x=read.table("data/wmq.txt",header=T);
xt=xtabs(Count~.,x)
a <- loglin(xt,list(1:2,c(1,3)))
## 2 iterations: deviation 1.421085e-14
a
## $lrt
## [1] 1.099333
## 
## $pearson
## [1] 1.099896
## 
## $df
## [1] 3
## 
## $margin
## $margin[[1]]
## [1] "Capacity" "Area"    
## 
## $margin[[2]]
## [1] "Capacity" "Location"
pchisq(a$lrt,a$df,low=F)
## [1] 0.7772351
pchisq(a$pearson,a$df,low=F)
## [1] 0.7770992
loglin(xt,list(1:2,c(1,3)),para=T)
## 2 iterations: deviation 1.421085e-14
## $lrt
## [1] 1.099333
## 
## $pearson
## [1] 1.099896
## 
## $df
## [1] 3
## 
## $margin
## $margin[[1]]
## [1] "Capacity" "Area"    
## 
## $margin[[2]]
## [1] "Capacity" "Location"
## 
## 
## $param
## $param$`(Intercept)`
## [1] 3.780874
## 
## $param$Capacity
##           L           M           S 
##  0.14228407 -0.12635489 -0.01592918 
## 
## $param$Area
##           N           S 
## -0.03162251  0.03162251 
## 
## $param$Location
##         R         U 
## -0.113156  0.113156 
## 
## $param$Capacity.Area
##         Area
## Capacity           N           S
##        L  0.11474022 -0.11474022
##        M -0.06421120  0.06421120
##        S -0.05052902  0.05052902
## 
## $param$Capacity.Location
##         Location
## Capacity           R           U
##        L  0.25557458 -0.25557458
##        M -0.03440251  0.03440251
##        S -0.22117207  0.22117207
loglin(xt,list(1,2,3))
## 2 iterations: deviation 0
## $lrt
## [1] 26.56823
## 
## $pearson
## [1] 27.29357
## 
## $df
## [1] 7
## 
## $margin
## $margin[[1]]
## [1] "Capacity"
## 
## $margin[[2]]
## [1] "Area"
## 
## $margin[[3]]
## [1] "Location"
loglin(xt,list(1:2,3))
## 2 iterations: deviation 1.421085e-14
## $lrt
## [1] 22.80078
## 
## $pearson
## [1] 22.6113
## 
## $df
## [1] 5
## 
## $margin
## $margin[[1]]
## [1] "Capacity" "Area"    
## 
## $margin[[2]]
## [1] "Location"
loglin(xt,list(1,2:3))
## 2 iterations: deviation 0
## $lrt
## [1] 24.70076
## 
## $pearson
## [1] 24.51553
## 
## $df
## [1] 6
## 
## $margin
## $margin[[1]]
## [1] "Capacity"
## 
## $margin[[2]]
## [1] "Area"     "Location"
loglin(xt,list(2,c(1,3)))
## 2 iterations: deviation 1.421085e-14
## $lrt
## [1] 4.866786
## 
## $pearson
## [1] 4.856742
## 
## $df
## [1] 5
## 
## $margin
## $margin[[1]]
## [1] "Area"
## 
## $margin[[2]]
## [1] "Capacity" "Location"
loglin(xt,list(1:2,c(1,3)))
## 2 iterations: deviation 1.421085e-14
## $lrt
## [1] 1.099333
## 
## $pearson
## [1] 1.099896
## 
## $df
## [1] 3
## 
## $margin
## $margin[[1]]
## [1] "Capacity" "Area"    
## 
## $margin[[2]]
## [1] "Capacity" "Location"
loglin(xt,list(1:2,2:3))
## 2 iterations: deviation 2.842171e-14
## $lrt
## [1] 20.9333
## 
## $pearson
## [1] 20.793
## 
## $df
## [1] 4
## 
## $margin
## $margin[[1]]
## [1] "Capacity" "Area"    
## 
## $margin[[2]]
## [1] "Area"     "Location"
loglin(xt,list(c(1,3),2:3))
## 2 iterations: deviation 0
## $lrt
## [1] 2.999312
## 
## $pearson
## [1] 2.998752
## 
## $df
## [1] 4
## 
## $margin
## $margin[[1]]
## [1] "Capacity" "Location"
## 
## $margin[[2]]
## [1] "Area"     "Location"