R-alpha: glm bugs

Ross Ihaka (ihaka@stat.auckland.ac.nz)
Tue, 7 Jan 1997 10:50:16 +1300 (NZDT)


From: Ross Ihaka <ihaka@stat.auckland.ac.nz>
Date: Tue, 7 Jan 1997 10:50:16 +1300 (NZDT)
Message-Id: <199701062150.KAA10367@stat13.stat.auckland.ac.nz>
To: r-testers@stat.math.ethz.ch (r-testers)
Subject: R-alpha: glm bugs
In-Reply-To: <9612230843.AA28147@alpha.luc.ac.be>

Jim Lindsey writes:
 > y <- rpois(20,5)
 > x1 <- gl(5,4,20)
 > x2 <- gl(4,1,20)
 > glm(y~x1+x2,family=poisson)  #works
 > glm(y~x1+x2,family=poisson,weights=as.numeric(x2!=1))
 > Error in numeric(nr): negative length vectors are not allowed.
 > # but works if a whole row or column is not weighted out
 > glm(y~x1+x2+as.numeric(x1):as.numeric(x2),family=poisson) #works
 > glm(y~x1+x2+x1:as.numeric(x2),family=poisson)
 > Error in numeric(nr): negative length vectors are not allowed.

[Discontinuity ...]

This is one of a number of bugs in glm tripped when a non full-rank
design matrix is encountered.  The following code should be used to
replace the file "src/library/base/funs/glm".

I tracking these down I noticed a number of computation/memory
inefficiencies.  I have not fixed them yet, but it should be possible
to improve the performance of glm quite a bit.
	Ross

- - - s n i p - - h e r e - - -

glm <- function(formula, family=gaussian, data, weights=NULL,
	subset=NULL, na.action=na.fail, start=NULL, offset=NULL,
	control=glm.control(epsilon=0.0001, maxit=10, trace=FALSE),
	model=TRUE, method=glm.fit, x=FALSE, y=TRUE)
{
	call <- sys.call()

	# family

	if(is.character(family)) family <- get(family)
	if(is.function(family)) family <- family()
	if(is.null(family$family)) stop("family not recognised")
		
	# extract x, y, etc from the model formula and frame

	mt <- terms(formula)
	if(missing(data)) data <- sys.frame(sys.parent())
        mf <- match.call()
        mf$family <- mf$start <- mf$control <- mf$maxit <- NULL
	mf$model <- mf$method <- mf$x <- mf$y <- NULL
        mf$use.data <- TRUE
        mf[[1]] <- as.name("model.frame")
        mf <- eval(mf, sys.frame(sys.parent()))
	X <- model.matrix(mt, mf)
	Y <- model.response(mf, "numeric")
	weights <- model.weights(mf)
	offset <- model.offset(mf)

	# check weights and offset

	if( !is.null(weights) && any(weights<0) )
		stop("Negative wts not allowed")
	if(!is.null(offset) && length(offset) != NROW(Y))
		stop(paste("Number of offsets is", length(offset),
			", should equal", NROW(Y), "(number of observations)"))

	# fit model via iterative reweighted least squares

	fit <- method(x=X, y=Y, weights=weights, start=start,
		offset=offset, family=family, control=control)

	# output

	if(model) fit$model <- mf
	if(x) fit$x <- X
	else fit$x <- NULL
	if(!y) fit$y <- NULL

	fit <- c(fit, list(call=call, formula=formula, x=x, terms=mt, data=data,
		offset=offset, control=control, method=method))
	class(fit) <- c("glm", "lm")
	return(fit)
}


glm.control <- function(epsilon = 0.0001, maxit = 10, trace = FALSE)
{
	if(epsilon <= 0)
		stop("value of epsilon must be > 0")
	if(maxit <= 0)
		stop("maximum number of iterations must be > 0")
	return(list(epsilon = epsilon, maxit = maxit, trace = trace))
}


glm.fit <- function(x, y, weights=rep(1, nobs), start=NULL,
	offset=rep(0, nobs), family=gaussian(), control=glm.control(),
	intercept=TRUE)
{
	xnames <- dimnames(x)[[2]]
	ynames <- names(y)

	# this function fits a generalized linear model via
	# iterative reweighted least squares for any family

	conv <- FALSE
	nobs <- NROW(y)
	nvars <- NCOL(x)

	# define weights and offset if needed

	if(is.null(weights)) weights <- rep(1, nobs)
	if(is.null(offset)) offset <- rep(0, nobs)

	# get family functions

	variance <- family$variance
	dev.resids <- family$dev.resids
	linkinv <- family$linkinv
	mu.eta <- family$mu.eta
	eval(family$initialize, sys.frame(sys.nframe()))
	if(NCOL(y) > 1)	stop("y must be univariate unless binomial")

	# calculates the first estimate of eta and mu

	if (is.null(start)) {
		linkfun <- family$linkfun
		etastart <- linkfun(mustart)
		z <- etastart + (y - mustart)/mu.eta(etastart) - offset
		w <- ((weights * mu.eta(etastart)^2)/variance(mustart))^0.5
		fit <- qr(x * w)
		eta <- qr.fitted(fit, w * z)
	}
	else {
		if (length(start) != nvars)
			stop(paste("Length of start should equal", nvars,
				"and correspond to initial coefs for",
				deparse(xnames)))
		if(NCOL(x)==1) eta <- as.vector(x*start)
		else eta <- as.vector(x %*% start)
	}
	mu <- linkinv(eta + offset)

	# calculate initial deviance and coefficient

	devold <- sum(dev.resids(y, mu, weights))
	coefold <- start

	# do iteration

	for(iter in 1:control$maxit) {

		# calculate z and w using only values where mu.eta != 0

		mu.eta.val <- mu.eta(eta+offset)
		good <- mu.eta.val!=0
		z <- eta[good] + (y - mu)[good]/mu.eta.val[good]
		w <- ((weights * mu.eta.val^2)[good]/variance(mu)[good])^0.5
		x <- as.matrix(x)
		ngoodobs <- as.integer(nobs-sum(!good))
		ncols <- as.integer(1)

		# call linpack code

		fit <- .Fortran("dqrls",
			qr=x[good, ]*w,
			n=as.integer(ngoodobs),
			p=nvars,
			y=w*z,
			ny=ncols,
			tol=1e-07,
			coefficients = mat.or.vec(nvars, 1),
			residuals = mat.or.vec(ngoodobs, 1),
			effects = mat.or.vec(ngoodobs, 1),
			rank = integer(1),
			pivot = as.integer(1:nvars),
			qraux = double(nvars),
			work = double(2 * nvars)
		)

		# stop if not enough parameters

		if(nobs < fit$rank)
			stop(paste("X matrix has rank", fit$rank, "but only",
				nobs, "observations"))

		# if X matrix was not full rank then columns
		# were pivoted, hence we need to re-label the names

		if( fit$rank != nvars ) {
			xnames <- xnames[fit$pivot]
			dimnames(fit$qr) <- list(NULL, xnames)
		}

		# calculate updated values of eta and mu with the new coef

		coef <- fit$coefficients
		if(nvars==1)	eta[good] <- x[good]*coef
		else	eta[good] <- as.vector(x[good, ] %*% coef)
		mu <- linkinv(eta + offset)
		if(family$family=="binomial")
			if(any(mu==1) || any(mu==0))
				stop("fitted probabilities of 0 or 1 occured")
		dev <- sum(dev.resids(y, mu, weights))

		if(control$trace)
			cat("Deviance =", dev, "Iterations -", iter, "\n")

		# check for divergence

		if(any(is.na(dev)) || any(is.na(coef))) {
			warning("Iteration truncated due to divergence")
			break
		}

		# check for convergence

#		if(abs((dev-devold)/sum(dev^2)) < control$epsilon
#		 & abs((coef-coefold)/sum(coef^2)) < control$epsilon) {
#			conv <- TRUE
#			break
#		}
		if(abs(dev-devold)/(0.1+abs(dev)) < control$epsilon) {
			conv <- TRUE
			break
		}
		else {
			devold <- dev
			coefold <- coef
		}
	}

	if(!conv) warning("Algorithm did not converge")

	# calculate residuals

	residuals <- rep(NA, nobs)
	residuals[good] <- z - eta

	# name output

	fit$qr <- as.matrix(fit$qr)
	Rmat <- fit$qr[1:nvars, 1:nvars]
	Rmat <- as.matrix(Rmat)
	Rmat[row(Rmat) > col(Rmat)] <- 0
	names(coef) <- xnames
	colnames(fit$qr) <- xnames
	dimnames(Rmat) <- list(xnames, xnames)
	names(residuals) <- ynames
	names(mu) <- ynames
	names(eta) <- ynames
	names(w) <- ynames
	names(weights) <- ynames
	names(y) <- ynames

	# calculate null deviance

	if (intercept)	wtdmu <- sum(weights * y)/sum(weights)
	else wtdmu <- linkinv(offset)
	nulldev <- sum(dev.resids(y, wtdmu, weights))

	# calculate df

	nulldf <- nobs - as.numeric(intercept)
	resdf <- nobs - fit$rank - sum(weights==0)

	qr <- list(qr=fit$qr, rank=fit$rank, qraux=fit$qraux)

	return(list(
		coefficients=coef,
		residuals=residuals,
		fitted.values=mu,
		effects=fit$effects,
		R=Rmat,
		rank=fit$rank,
		qr=qr,
		family=family,
		linear.predictors=eta,
		deviance=dev,
		null.deviance=nulldev,
		iter=iter,
		weights=w^2,
		prior.weights=weights,
		df.residual=resdf,
		df.null=nulldf,
		y=y))
}


print.glm <- function (x, digits = max(3, .Options$digits - 3),
	na.print="", ...)
{
	cat("\nCall: ", deparse(x$call), "\n\n")
	cat("Coefficients:\n")
	print.default(round(x$coefficients, digits), print.gap = 2)
	cat("\nDegrees of Freedom:", length(x$residuals), "Total;",
		 x$df.residual, "Residual\n")
	cat("Null Deviance:", format(signif(x$null.deviance, digits)), "\n")
	cat("Residual Deviance:", format(signif(x$deviance, digits)), "\n")
	invisible(x)
}


anova.glm <- function(object, ..., test=NULL, na.action=na.omit)
{
	# check for multiple objects

	args <- function(...) nargs()
	if(args(...)) return(anova.glmlist(list(object, ...), test=test))

	# extract variables from model

	varlist <- attr(object$terms, "variables")
	if(!is.null(object$x) && !(is.logical(object$x) || 
		object$x==FALSE)) x <- object$x
	else {
		if(is.null(object$model)) {
			if(is.null(object$data))	
				object$data <- sys.frame(sys.parent())
			object$model <- na.action(
				model.frame(eval(varlist, object$data),
					as.character(varlist[-1]), NULL))
		}
		x <- model.matrix(object$terms, object$model)
	}
	varseq <- attr(x, "assign")
	nvars <- max(varseq)
	resdev <- resdf <- NULL

	# if there is more than one explanatory variable then
	# recall glm.fit to fit variables sequentially

	if(nvars > 1) {
		for(i in 1:(nvars-1)) {

			# explanatory variables up to i are kept in the model

			tempx <- x[, varseq <= i]

			# use method from glm to find residual deviance
			# and df for each sequential fit

			method <- object$method
			fit <- method(x=tempx, y=object$y,
				weights=object$prior.weights,
				start=object$start,
				offset=object$offset,
				family=object$family,
				control=object$control)
			resdev <- c(resdev, fit$deviance)
			resdf <- c(resdf, fit$df.residual)
		}
	}

	# add values from null and full model

	resdf <- c(object$df.null, resdf, object$df.residual)
	resdev <- c(object$null.deviance, resdev, object$deviance)

	# construct table and title

	table <- cbind(c(NA, -diff(resdf)), c(NA, -diff(resdev)), resdf, resdev)
	dimnames(table) <- list(c("NULL", attr(object$terms, "term.labels")),
				c("Df", "Deviance", "Resid. Df", "Resid. Dev"))
	title <- paste("Analysis of Deviance Table", "\n\nModel: ",
		object$family$family, ", link: ", object$family$link,
		"\n\nResponse: ", as.character(varlist[-1])[1],
		"\n\nTerms added sequentially (first to last)\n\n", sep="")

	# calculate test statistics if needed

	if(!is.null(test))
		table <- stat.anova(table=table, test=test, scale=sum(
			object$weights*object$residuals^2)/object$df.residual,
			df.scale=object$df.residual, n=NROW(x))

	# return output

	output <- list(title=title, table=table)
	class(output) <- "anova.glm"
	return(output)
}


anova.glmlist <- function(object, ..., test=NULL)
{

	# find responses for all models and remove
	# any models with a different response

	responses <- as.character(lapply(object, function(x) {
			as.character(x$formula[2])} ))
	sameresp <- responses==responses[1]
	if(!all(sameresp)) {
		object <- object[sameresp]
		warning(paste("Models with response", deparse(responses[
			!sameresp]), "removed because response differs from",
			"model 1"))
	}

	# calculate the number of models

	nmodels <- length(object)
	if(nmodels==1)	return(anova.glm(object[[1]], ..., test))

	# extract statistics

	resdf <- as.numeric(lapply(object, function(x) x$df.residual))
	resdev <- as.numeric(lapply(object, function(x) x$deviance))

	# construct table and title

	table <- cbind(resdf, resdev, c(NA, -diff(resdf)), c(NA, -diff(resdev)))
	variables <- as.character(lapply(object, function(x) {
			as.character(x$formula[3])} ))
	dimnames(table) <- list(variables, c("Resid. Df", "Resid. Dev", "Df",
				"Deviance"))
	title <- paste("Analysis of Deviance Table \n\nResponse: ",
			responses[1], "\n\n", sep="")

	# calculate test statistic if needed

	if(!is.null(test)) {
		bigmodel <- object[[(order(resdf)[1])]]
		table <- stat.anova(table=table, test=test, scale=sum(
			bigmodel$weights * bigmodel$residuals^2)/
			bigmodel$df.residual, df.scale=min(resdf),
			n=length(bigmodel$residuals))
	}

	# return output

	output <- list(table=table, title=title)
	class(output) <- "anova.glm"
	return(output)
}

	
stat.anova <- function(table, test, scale, df.scale, n)
{
	testnum <- match(test, c("Chisq", "F", "Cp"))
	cnames <- colnames(table)
	rnames <- rownames(table)
	if(is.na(testnum))
		stop(paste("Test \"", test, "\" not recognised", sep=""))
	if(testnum==1) {
		chisq <- 1-pchisq(abs(table[, "Deviance"]), abs(table[, "Df"]))
		table <- cbind(table, chisq)
		dimnames(table) <- list(rnames, c(cnames, "P(>|Chi|)"))
	}
	if(testnum==2) {
		Fvalue <- abs((table[, "Deviance"]/table[, "Df"])/scale)
		pvalue <- 1-pf(Fvalue, abs(table[, "Df"]), abs(df.scale))
		table <- cbind(table, Fvalue, pvalue)
		dimnames(table) <- list(rnames, c(cnames, "F", "Pr(>F)"))
	}
	if(testnum==3) {
		Cp <- table[, "Resid. Dev"] + 2*scale*(n - table[, "Resid. Df"])
		table <- cbind(table, Cp)
		dimnames(table) <- list(rnames, c(cnames, "Cp"))
	}
	return(table)
}


summary.glm <- function(object, dispersion = NULL,
	correlation = TRUE, na.action=na.omit)
{

	# calculate dispersion if needed

	if(is.null(dispersion)) {
		if(any(object$family$family == c("poisson", "binomial")))
			dispersion <- 1
		else {
			if(any(object$weights==0))
				warning(paste("observations with zero weight",
				"not used for calculating dispersion"))
			dispersion <- sum(object$weights*object$residuals^2)/
					object$df.residual
		}
	}

	# extract x to get column names

	if(is.null(object$x)) {
		if(is.null(object$model)) {
			varlist <- attr(object$terms, "variables")
			if(is.null(object$data))	
				object$data <- sys.frame(sys.parent())
			object$model <- na.action(model.frame(eval(varlist,
				object$data), as.character(varlist[-1]), NULL))
		}
		object$x <- model.matrix(object$terms, object$model)
	}

	# calculate scaled and unscaled covariance matrix

	Rinv <- solve(object$R)
	covmat.unscaled <- as.matrix(Rinv %*% t(Rinv))
	dimnames(covmat.unscaled) <- list(names(object$coefficients),
			names(object$coefficients))
	covmat <- dispersion*covmat.unscaled
	dimnames(covmat) <- dimnames(covmat.unscaled)

	# calculate coef table

	nas <- is.na(object$coefficients)
	tvalue <- object$coefficients/diag(covmat)^0.5
	pvalue <- 2*(1-pt(abs(tvalue), object$df.residual))
	coef.table <- cbind(object$coefficients, diag(covmat)^0.5,
		tvalue, pvalue)
	dimnames(coef.table) <- list(names(object$coefficients),
				c("Value", "Std.error", "t value", "P(>|t|)"))

	# return answer

	ans <- list(
		call=object$call,
		terms=object$terms,
		family=object$family,
		deviance.resid=residuals(object, type = "deviance"),
		coefficients=coef.table,
		dispersion=dispersion,
		df=c(object$rank, object$df.residual),
		deviance=object$deviance,
		df.residual=object$df.residual,
		null.deviance=object$null.deviance,
		df.null=object$df.null,
		iter=object$iter,
		cov.unscaled=covmat.unscaled,
		cov.scaled=covmat,
		nas=nas)

	if(correlation) {
		ans$correlation <- as.matrix(covmat/(outer(diag(covmat),
			diag(covmat))^0.5))
	}
	class(ans) <- "summary.glm"
	return(ans)
}


print.summary.glm <- function (x, digits = max(3, .Options$digits - 3),
	na.print="", ...)
{
        cat("\nCall:\n")
        cat(paste(deparse(x$call), sep="\n", collapse="\n"), "\n\n", sep="")
	cat("Deviance Residuals: \n")
	if(x$df.residual > 5) {
		x$deviance.resid <- quantile(x$deviance.resid)
		names(x$deviance.resid) <- c("Min", "1Q", "Median", "3Q", "Max")
	}
	print.default(x$deviance.resid, digits=digits, na = "", print.gap = 2)
	cat("\nCoefficients:\n")
	print.default(x$coefficients, digits=digits, print.gap = 2)
	cat(paste("\n(Dispersion parameter for ", x$family$family,
		" family taken to be ", x$dispersion,
		")\n\n    Null deviance: ", x$null.deviance,
		" on ", x$df.null, " degrees of freedom\n\n",
		"Residual deviance: ", x$deviance,
		" on ", x$df.residual, " degrees of freedom\n\n",
		"Number of Fisher Scoring iterations: ", x$iter,
		"\n\n", sep=""))

	correl <- x$correlation
	if(!is.null(correl)) {
		p <- dim(correl)[2]
		if(p > 1) {
			cat("Correlation of Coefficients:\n")
			correl[!lower.tri(correl)] <- NA
			print(correl[-1, -NCOL(correl), drop=FALSE],
				digits=digits, na="")
		}
		cat("\n")
	}
	invisible(x)
}


print.anova.glm <- function(x, digits = max(3, .Options$digits - 3),
	na.print = "", ...)
{
	cat("\n", x$title, sep="")
	print.default(x$table, digits=digits, na = "", print.gap = 2)
	cat("\n")
}

# Generic Functions

coefficients.glm <- function(object)
	object$coefficients


deviance.glm <- function(object)
	object$deviance


effects.glm <- function(object)
	object$effects


family.glm <- function(object) {
	family <- get(as.character(object$family$family), mode="function")
	family()
}


fitted.values.glm <- function(object)
	object$fitted.values


residuals.glm <- function(object, type="deviance")
{
	type <- match(type, c("deviance", "pearson", "working", "response"))
	y <- object$y
	mu <- object$fitted.values
	wts <- object$prior.weights
	switch(type,
		deviance={
			dev.resids <- object$family$dev.resids
			ifelse(y>mu, dev.resids(y, mu, wts)^0.5, -(dev.resids(
				y, mu, wts)^0.5))
		},
		pearson=object$residuals * object$weights^0.5,
		working=object$residuals,
		response=y - mu
		)
}
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
r-testers mailing list -- For info or help, send "info" or "help",
To [un]subscribe, send "[un]subscribe"
(in the "body", not the subject !)  To: r-testers-request@stat.math.ethz.ch
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-