R-alpha: `classical' tests for R

Kurt Hornik (Kurt.Hornik@ci.tuwien.ac.at)
Thu, 7 Nov 1996 16:10:42 +0100


Date: Thu, 7 Nov 1996 16:10:42 +0100
Message-Id: <199611071510.QAA25513@aragorn.ci.tuwien.ac.at>
From: Kurt Hornik <Kurt.Hornik@ci.tuwien.ac.at>
To: r-testers@stat.math.ethz.ch
Subject: R-alpha: `classical' tests for R

Attached are drafts of the following classical tests for R:

	chisq.test
	cor.test
	kruskal.test
	mcnemar.test
	var.test
	wilcox.test

They should do the same as their S counterparts, with 3 differences:

* The rank-based tests do not adjust for ties
* There is no exact Wilcoxon test (as there is no pwilcox ...).
* The parameters are returned under the name `parameter', and not
`parameters' as in S (because that's what print.htest is looking for)

I am looking forward to feedback, hopefully there are not too many
errors.

Code for binom.test, friedman.test and prop.test is under way.

If the tests are fine, I can also write documentation for them.

-k

**********************************************************************
chisq.test := function (x, y = NULL, correct = TRUE)
{
  if (is.matrix(x))
    {
      if ((nrow(x) < 2) || (ncol(x) < 2))
	stop("x must have at least 2 rows and columns")
      if (any(x < 0) || any(is.na(x)))
	stop("all entries of x must be nonnegative and finite")
      DNAME <- deparse(substitute(x))
    }
  else
    {
      if (is.null(y))
	stop("if x is not a matrix, y must be given")
      if (length(x) != length(y))
	stop("x and y must have the same length")
      DNAME <- paste(deparse(substitute(x)), "and",
		     deparse(substitute(y)))
      OK <- complete.cases(x, y)
      x <- as.factor(x[OK])
      y <- as.factor(y[OK])
      if ((nlevels(x) < 2) || (nlevels(y)  < 2))
	stop("x and y must have at least 2 levels")
      x <- table(x, y)
    }
  row.totals <- apply(x, 1, sum)
  col.totals <- apply(x, 2, sum)
  E <- outer(row.totals, col.totals, "*") / sum(x)
  if (any(E < 5))
    warning("Chi-square approximation may be incorrect") 
  PARAMETER <- (nrow(x) - 1) * (ncol(x) - 1)
  names(PARAMETER) <- "df"
  METHOD <- "Pearson's Chi-square test"
  if (correct && nrow(x) == 2 && ncol(x) == 2)
    {
      YATES <- .5
      METHOD <- paste(METHOD, "with Yates' continuity correction")
    }
  else
    YATES <- 0
  STATISTIC <- sum((abs(x - E) - YATES)^2 / E)
  names(STATISTIC) <- "X-squared"
  PVAL <- 1 - pchisq(STATISTIC, PARAMETER)

  RVAL <- list(statistic = STATISTIC, parameter = PARAMETER, p.value =
	       PVAL, method = METHOD, data.name = DNAME)
  class(RVAL) <- "htest"
  return(RVAL)
}
cor.test := function (x, y, alternative = "two.sided", method = "pearson") 
{
  CHOICES <- c("two.sided", "less", "greater")
  alternative <- CHOICES[pmatch(alternative, CHOICES)]
  if (length(alternative) > 1 || is.na(alternative)) 
    stop("alternative must be \"two.sided\", \"less\" or \"greater\"")
  CHOICES <- c("pearson", "kendall", "spearman")
  method <- CHOICES[pmatch(method, CHOICES)]
  if (length(method) > 1 || is.na(method)) 
    stop("method must be \"pearson\", \"kendall\" or \"spearman\"")

  DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))

  if (length (x) != length (y))
    stop ("x and y must have the same length")
  FINITE <- is.finite(x) & is.finite(y)
  x <- x[FINITE]
  n <- length (x)
  if (n < 3)
    stop ("not enough finite observations")
  else
    y <- y[FINITE]

  NVAL <- 0

  if (method == "pearson")
    {
      method <- "Pearson's product-moment correlation"
      names(NVAL) <- "correlation"
      r <- cor (x, y)
      ESTIMATE <- r
      names(ESTIMATE) <- "cor"
      PARAMETER <- n - 2
      names(PARAMETER) <- "df"
      STATISTIC <- sqrt (PARAMETER) * r / sqrt (1 - r^2)
      names(STATISTIC) <- "t"
      PVAL <- pt (STATISTIC, PARAMETER)
    }
  else
    {
      if (method == "kendall")
	{
	  method <- "Kendall's rank correlation tau"
	  names(NVAL) <- "tau"
	  x <- rank(x)
	  y <- rank(y)
	  ESTIMATE <- cor (c(sign(outer(x, x, "-"))),
			   c(sign(outer(y, y, "-"))))
	  names(ESTIMATE) <- "tau"
	  STATISTIC <- ESTIMATE / sqrt ((4*n + 10) / (9 * n * (n-1)))
	}
      else
	{
	  method <- "Spearman's rank correlation rho"
	  names(NVAL) <- "rho"
	  ESTIMATE <- cor (rank(x), rank(y))
	  names(ESTIMATE) <- "rho"
	  STATISTIC <- sqrt (n-1) * (ESTIMATE - 6 / (n^3 - n))
	}
      PARAMETER <- NULL
      names(STATISTIC) <- "z"
      PVAL <- pnorm (STATISTIC)
    }

  if (alternative == "two.sided")
    PVAL <- 2 * min (PVAL, 1 - PVAL)
  else if (alternative == "greater")
    PVAL <- 1 - PVAL

  RVAL <- list(statistic = STATISTIC, parameter = PARAMETER, p.value =
	       PVAL, estimate = ESTIMATE, null.value = NVAL, alternative
	       = alternative, method = method, data.name = DNAME)
  class(RVAL) <- "htest"
  return(RVAL)
}
kruskal.test := function (y, groups)
{
  if (length(y) != length(groups))
    stop("y and groups must have the same length")
  DNAME <- paste(deparse(substitute(y)), "and",
		 deparse(substitute(groups)))
  
  if (!all(is.finite(groups)))
    stop("all group levels must be finite")
  OK <- !is.na(y)
  y <- y[OK]
  n <- length(y)
  if (n < 2)
    stop("not enough observations")
  g <- as.factor(groups[OK])
  PARAMETER <- nlevels(g) - 1
  if (PARAMETER < 1)
    stop("all observations are in the same group")
  names(PARAMETER) <- "df"

  r <- rank(y)
  STATISTIC <- (12 * sum(tapply(r, g, "sum")^2 /
			 tapply(r, g, "length")) / (n*(n+1)) - 3*(n+1))
  names(STATISTIC) <- "Kruskal-Wallis chi-square"
  PVAL <- 1 - pchisq(STATISTIC, PARAMETER)
  
  RVAL <- list (statistic = STATISTIC, parameter = PARAMETER, p.value =
		PVAL, alternative = "two.sided", method =
		"Kruskal-Wallis rank sum test", data.name = DNAME)
  class(RVAL) <- "htest"
  return(RVAL)
}
mcnemar.test := function (x, y = NULL, correct = TRUE)
{
  if (is.matrix(x))
    {
      r <- nrow(x)
      if ((r < 2) || (ncol (x) != r))
	stop("x must be square with at least two rows and columns")
      if (any(x < 0) || any(is.na(x)))
	stop("all entries of x must be nonnegative and finite")
      DNAME <- deparse(substitute(x))
    }
  else
    {
      if (is.null(y))
	stop("if x is not a matrix, y must be given")
      if (length(x) != length(y))
	stop("x and y must have the same length")
      DNAME <- paste(deparse(substitute(x)), "and",
		     deparse(substitute(y)))
      OK <- complete.cases(x, y)
      x <- as.factor(x[OK])
      y <- as.factor(y[OK])
      r <- nlevels(x)
      if ((r < 2) || (nlevels(y) != r))
	stop("x and y must have the same number of levels (minimum 2)")
      x <- table(x, y)
    }

  PARAMETER <- r * (r-1) / 2
  names(PARAMETER) <- "df"
  METHOD <- "McNemar's Chi-square test"

  if (correct && (r == 2) && any(x - t(x)))
    {
      y <- (abs(x - t(x)) - 1)
      METHOD <- paste(METHOD, "with continuity correction")
    }
  else
    y <- x - t(x)
  x <- x + t(x)

  STATISTIC <- sum(y[upper.tri(x)]^2 / x[upper.tri(x)])
  names(STATISTIC) <- "McNemar's chi-square"
  PVAL <- 1 - pchisq(STATISTIC, PARAMETER)  
    
  RVAL <- list(statistic = STATISTIC, parameter = PARAMETER, p.value =
	       PVAL, method = METHOD, data.name = DNAME)
  class(RVAL) <- "htest"
  return(RVAL)
}
var.test := function (x, y, alternative = "two.sided", conf.level = 0.95) 
{
  CHOICES <- c("two.sided", "less", "greater")
  alternative <- CHOICES[pmatch(alternative, CHOICES)]
  if (length(alternative) > 1 || is.na(alternative)) 
    stop("alternative must be \"two.sided\", \"less\" or \"greater\"")
  if (!missing(conf.level) &&
      (length(conf.level) > 1 || is.na(conf.level) ||
       (conf.level <= 0) || (conf.level >= 1)))  
    stop("conf.level must be a single number between 0 and 1")

  DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(y)))

  x <- x[is.finite(x)]
  DF.x <- length(x) - 1
  if (DF.x < 1) 
    stop("not enough x observations")
  y <- y[is.finite(y)]
  DF.y <- length(y) - 1
  if (DF.y < 1) 
    stop("not enough y observations")

  V.x <- var(x)
  V.y <- var(y)
  STATISTIC <- V.x/V.y
  PARAMETER <- c(DF.x, DF.y)
  ESTIMATE <- c(V.x, V.y)
  PVAL <- pf(STATISTIC, DF.x, DF.y)
  NVAL <- 1
  if (alternative == "two.sided")
    {
      PVAL <- 2 * min(PVAL, 1 - PVAL)
      BETA <- (1 - conf.level) / 2
      CINT <- c(STATISTIC / qf(1 - BETA, DF.x, DF.y),
		STATISTIC / qf(BETA, DF.x, DF.y))
    }
  else if (alternative == "greater")
    {
      PVAL <- 1 - PVAL
      CINT <- c(STATISTIC / qf(conf.level, DF.x, DF.y), NA)
    }
  else CINT <- c(0, STATISTIC / qf(1 - conf.level, DF.x, DF.y))
  names(STATISTIC) <- "F"
  names(PARAMETER) <- c("num df", "denom df")
  names(ESTIMATE) <- c("variance of x", "variance of y")
  names(NVAL) <- "ratio of variances"
  attr(CINT, "conf.level") <- conf.level
  RVAL <- list(statistic = STATISTIC, parameter = PARAMETER,
	       p.value = PVAL, conf.int = CINT, estimate = ESTIMATE,
	       null.value = NVAL, alternative = alternative,
	       method = "F test for equal variances", data.name = DNAME)
  attr(RVAL, "class") <- "htest"
  return(RVAL)
}
wilcox.test := function(x, y = NULL, alternative = "two.sided", mu = 0,
			paired = F, exact = F, correct = T) 
{
  CHOICES <- c("two.sided", "less", "greater")
  alternative <- CHOICES[pmatch(alternative, CHOICES)]
  if (length(alternative) > 1 || is.na(alternative)) 
    stop("alternative must be \"two.sided\", \"less\" or \"greater\"")

  if (!missing(mu) && ((length(mu) > 1) || !is.finite(mu)))
    stop("mu must be a single number")

  if (!is.null(y))
    {
      DNAME <- paste(deparse(substitute(x)), "and",
		     deparse(substitute(y)))
      if (paired)
	{
	  if (length(x) != length(y))
	    stop("x and y must have the same length")
	  OK <- complete.cases (x, y)
	  x <- x[OK] - y[OK]
	  y <- NULL
	}
      else
	{
	  x <- x[is.finite(x)]
	  y <- y[is.finite(y)]
	}
    }
  else
    {
      DNAME <- deparse(substitute(x))
      if (paired)
	stop("y missing for paired test")
      x <- x[is.finite(x)]
    }

  if (length(x) < 1)
    stop("not enough x observations")

  if (exact)
    warning("exact Wilcoxon tests are currently not implemented\n")

  PARAMETER <- NULL
  
  CORRECTION <- 0  

  if (is.null(y))
    {
      METHOD <- "Wilcoxon signed rank test"
      x <- x - mu
      STATISTIC <- sum(rank(abs(x))[x > 0])
      names(STATISTIC) <- "V"
      n <- length(x)
      z <- STATISTIC - n*(n+1)/4
      SIGMA <- sqrt(n*(n+1)*(2*n+1)/24)
    }
  else
    {
      if (length(y) < 1)
	stop("not enough y observations")
      METHOD <- "Wilcoxon rank sum test"
      STATISTIC <- sum(rank(c(x - mu, y))[seq(along = x)])
      names(STATISTIC) <- "W"
      n.x <- length(x)
      n.y <- length(y)
      z <- STATISTIC - n.x*(n.x+n.y+1)/2
      SIGMA <- sqrt(n.x*n.y*(n.x+n.y+1)/12)
    }

  if (correct)
    {
      CORRECTION <- switch ("two.sided", sign(z) * 0.5,
			    "greater", 0.5,
			    "less", -0.5)
      METHOD <- paste(METHOD, "with continuity correction")
    }
  
  PVAL <- pnorm((z - CORRECTION) / SIGMA)
  if (alternative == "two.sided")
    PVAL <- 2 * min (PVAL, 1 - PVAL)
  else if (alternative == "greater")
    PVAL <- 1 - PVAL

  NVAL <- mu
  names(NVAL) <- "mu"

  RVAL <- list(statistic = STATISTIC, parameter = PARAMETER, p.value =
	       PVAL, null.value = NVAL, alternative = alternative,
	       method = METHOD, data.name = DNAME)
  class(RVAL) <- "htest"
  return(RVAL)
}
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
r-testers mailing list -- To (un)subscribe, send
subscribe	or	unsubscribe
(in the "body", not the subject !)  To: r-testers-request@stat.math.ethz.ch
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-