From: Bob <thebobs_at_mistral.co.uk>

Date: Thu, 17 Jan 2008 18:59:26 -0000

D=ab*cd*ac*bd

if(D==0)x2=NA else x2=n*(abs(a*d-b*c)-C)^2/D x2}

pl=rank(c(q,Q),ties.method='max')[1]/(length(Q)+1) pe=sum(c(q,Q)==q)/(length(Q)+1);pu=1-pl+pe cat('with cc P = ',pu,'\n')

q=x2(x)

pl=rank(c(q,Q),ties.method='max')[1]/(length(Q)+1) pe=sum(c(q,Q)==q)/(length(Q)+1);pu=1-pl+pe cat('conventional-P = ',pu,'\n')

cat('mid-P = ',pu-pe/2,'\n')}

R-help_at_r-project.org mailing list

https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code. Received on Thu 17 Jan 2008 - 19:13:00 GMT

Date: Thu, 17 Jan 2008 18:59:26 -0000

R Help on 'chisq.test' states that

In the contingency table case this is done by random sampling from the set of all contingency tables with given marginals, and works only if the marginals are positive... In the goodness-of-fit case this is done by random sampling from the discrete distribution specified by 'p', each sample being of size 'n = sum(x)'."

The last paragraph suggests that in the goodness-of-fit case, if p gives the

expected probability for each cell, this random sampling is multinomial.

Unfortunately, as the following examples reveal, the sampling model is neither

multinomial nor hypergeometric - at least when it is applied to a 4-fold table.

This is rather sad as some people assume that R's chisq.test function can

perform a Monte Carlo test of X-squared, employing a multinomial model - in

other words, assuming that your data are a single random sample.

### Example 1.

*> x=matrix(c(1,2,3,4),nc=2)
**> # To begin with, let us apply the large-sample approximations
**> chisq.test(x,correct=TRUE)$p.value
*

[1] 0.6726038

Warning message:

Chi-squared approximation may be incorrect in: chisq.test(x, correct = TRUE)

*> chisq.test(x,correct=FALSE)$p.value
*

[1] 0.7781597

Warning message:

Chi-squared approximation may be incorrect in: chisq.test(x, correct =
**FALSE)
**

*>
**> # So let us apply a 2-tailed test of O.R.=1, using a hypergeometric model
*

> fisher.test(x)$p.value

[1] 1

*> # This should also apply a hypergeometric model
**> chisq.test(x,simulate.p.value=TRUE,B=500000)$p.value
*

[1] 1

*>
**> # Now we work out the expected probability for each cell
**> p=outer(c(sum(x[1,]),sum(x[2,])),c(sum(x[,1]),sum(x[,2])))/sum(x)^2
**> # But this applies a hypergeometric model, presumably because p is not
**> scalar
**> chisq.test(x,p=p,simulate.p.value=TRUE,B=500000)$p.value
*

[1] 1

*> # This seems to do something different,
**> # at any rate it is much slower, and needs more memory
**> chisq.test(x[1:4],p=p[1:4],simulate.p.value=TRUE,B=10000)$p.value
*

[1] 1

*> # Which would appear to be using the same model as above
**>
**> # Now let us apply an X2 test using a multinomial model
**> # (The code for this x2.test function is in Appendix 1, below.)
**> x2.test(x,R=200000)
*

with cc P = 0.7316812

conventional-P = 0.8838786

mid-P = 0.8423058

*>
**> # All of these P-values are higher than those given by the Chi-squared
*

approximation,

*> # but they certainly do not equal 1.
**> # But is this is an artefact of our very small sample?
**>
**>
**>
**>
**> ### Example 2.
**> # Let us try a larger sample
**> x=matrix(c(56,35,23,42),nc=2)
**>
**> # First we apply the asymptotic model
**> chisq.test(x,correct=TRUE)$p.value
*

[1] 0.002222425

*> chisq.test(x,correct=FALSE)$p.value
*

[1] 0.001276595

*>
**> # Now for the hypergeometric (fixed margin totals model)
*

> fisher.test(x)$p.value

[1] 0.001931078

*> chisq.test(x,simulate.p.value=TRUE,B=500000)$p.value
*

[1] 0.001913996

*> p=outer(c(sum(x[1,]),sum(x[2,])),c(sum(x[,1]),sum(x[,2])))/sum(x)^2
**> chisq.test(x,p=p,simulate.p.value=TRUE,B=500000)$p.value
*

[1] 0.001891996

*>
**> Next comes what we had hoped to be a multinomial test
**> chisq.test(x[1:4],p=p[1:4],simulate.p.value=TRUE,B=10000)$p.value
*

[1] 0.01639836

*> # This is obviously not the same hypergeometric model as used for a > #
*

chi-squared test.

*> # The P-value is about 10x of the approximate tests (above)
**> # or the exact tests (below).
**>
**> x2.test(x,R=200000)
*

with cc P = 0.002059990

conventional-P = 0.001184994

mid-P = 0.001172494

*>
**> # Whatever that chi-squared test model IS, it is certainly not
**> multinomial!
**> # Could it possibly be Poisson and, if so, why???
*

######## Appendix 1:

# We have used these functions to do a 2x2 multinomial test of X2:

x2=function(y,cc=FALSE){

y=y*1.;n=sum(y);C=cc*n/2 a=y[1];b=y[2];c=y[3];d=y[4] ab=a+b;cd=c+d;ac=a+c;bd=b+d

D=ab*cd*ac*bd

if(D==0)x2=NA else x2=n*(abs(a*d-b*c)-C)^2/D x2}

x2.test=function(x,R=5000){

n=sum(x) p=outer(c(sum(x[1,]),sum(x[2,])),c(sum(x[,1]),sum(x[,2])))/n/n Q=sort(apply(rmultinom(R,n,p),2,x2)) q=x2(x,cc=TRUE)

pl=rank(c(q,Q),ties.method='max')[1]/(length(Q)+1) pe=sum(c(q,Q)==q)/(length(Q)+1);pu=1-pl+pe cat('with cc P = ',pu,'\n')

q=x2(x)

pl=rank(c(q,Q),ties.method='max')[1]/(length(Q)+1) pe=sum(c(q,Q)==q)/(length(Q)+1);pu=1-pl+pe cat('conventional-P = ',pu,'\n')

cat('mid-P = ',pu-pe/2,'\n')}

Bob

R-help_at_r-project.org mailing list

https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code. Received on Thu 17 Jan 2008 - 19:13:00 GMT

Archive maintained by Robert King, hosted by
the discipline of
statistics at the
University of Newcastle,
Australia.

Archive generated by hypermail 2.2.0, at Thu 17 Jan 2008 - 21:30:07 GMT.

*
Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-help.
Please read the posting
guide before posting to the list.
*