[Rd] Minor logical bug in rbind.data.frame ?

From: Steven McKinney <smckinney_at_bccrc.ca>
Date: Fri 12 Jan 2007 - 22:18:10 GMT

When attempting to merge 3 data frames, one of which has fewer columns than the others, rbind.data.frame correctly refuses to perform the bind. However, the error message given is a bit obscure due to a logical bug in the match.names() internal function to rbind.data.frame.

Illustration:

## Three data frames with same column variable names:
> foo <- data.frame(v1 = c('a', 'b'), v2 = c(1, 2), v3 = ordered(c('x', 'y')))
> bar <- data.frame(v2 = c(3, 4), v3 = ordered(c('x', 'y')), v1 = c('c', 'd'))
> baz <- data.frame(v1 = c('a', 'e'), v2 = c(5, 6), v3 = ordered(c('x', 'z')))
> rbind(foo, bar, baz)

  v1 v2 v3
1 a 1 x
2 b 2 y
3 c 3 x
4 d 4 y
5 a 5 x
6 e 6 z
> ## All is fine.

## Third data frame has a different third column variable
> fifi <- data.frame(v1 = c('a', 'e'), v2 = c(9, 10), v4 = ordered(c('y', 'z')))
> rbind(foo, bar, fifi)

Error in match.names(clabs, names(xi)) : names do not match previous names:

        v4
> ## Output looks fine.

## Third data frame is missing a column.
> gaga <- data.frame(v1 = c('a', 'e'), v2 = c(7, 8))
> rbind(foo, bar, gaga)

Error in paste(nmi[nii == 0], collapse = ", ") :

        object "nii" not found
In addition: Warning message:
longer object length

        is not a multiple of shorter object length in: clabs == nmi
>

This somewhat cryptic error message results because the second portion of the second if() test clause in the match.names() function is not evaluated, as it is the second argument to function '&&' and the first argument (length(nmi) == length(clabs)) evaluates to FALSE. Thus variable nii is not defined and the stop() command at the end of match.names() throws an (unintended?) error.

If I switch the order of these two arguments to '&&' (see my test function sm.rbind.data.frame below) I get what appears to be the intended output given an input data frame with a missing column (relative to the prior data frames):

> sm.rbind.data.frame(foo, bar, gaga)
Error in match.names(clabs, names(xi)) : names do not match previous names:         

In addition: Warning message:
longer object length

        is not a multiple of shorter object length in: clabs == nmi

Is this a possible fix for rbind.data.frame, or would the changed order break something else?

## Modified function handles all the above test cases properly
> sm.rbind.data.frame(foo, bar, baz)
  v1 v2 v3
1 a 1 x
2 b 2 y
3 c 3 x
4 d 4 y
5 a 5 x
6 e 6 z
> sm.rbind.data.frame(foo, bar, fifi)
Error in match.names(clabs, names(xi)) : names do not match previous names:

        v4
> sm.rbind.data.frame(foo, bar, gaga)
Error in match.names(clabs, names(xi)) : names do not match previous names:         

In addition: Warning message:
longer object length

        is not a multiple of shorter object length in: clabs == nmi
>

### rbind.data.frame with modified match.names() sm.rbind.data.frame <-
function (..., deparse.level = 1)
{

    match.names <- function(clabs, nmi) {

        if (all(clabs == nmi)) 
            NULL
### Switched order of args to '&&' in following if() test clause
        else if (all(nii <- match(nmi, 
            clabs, 0)) && length(nmi) == length(clabs)) {
            m <- pmatch(nmi, clabs, 0)
            if (any(m == 0)) 
                stop("names do not match previous names")
            m
        }
        else stop("names do not match previous names:\n\t", paste(nmi[nii == 
            0], collapse = ", "))

    }
    Make.row.names <- function(nmi, ri, ni, nrow) {
        if (nchar(nmi) > 0) {
            if (ni == 0) 
                character(0)
            else if (ni > 1) 
                paste(nmi, ri, sep = ".")
            else nmi
        }
        else if (nrow > 0 && identical(ri, 1:ni)) 
            as.integer(seq.int(from = nrow + 1, length = ni))
        else ri

    }
    allargs <- list(...)
    allargs <- allargs[sapply(allargs, length) > 0]     n <- length(allargs)
    if (n == 0)

        return(structure(list(), class = "data.frame", row.names = integer()))     nms <- names(allargs)
    if (is.null(nms))

        nms <- character(length(allargs))     cl <- NULL
    perm <- rows <- rlabs <- vector("list", n)     nrow <- 0
    value <- clabs <- NULL
    all.levs <- list()
    for (i in 1:n) {

        xi <- allargs[[i]]
        nmi <- nms[i]
        if (is.matrix(xi)) 
            allargs[[i]] <- xi <- as.data.frame(xi)
        if (inherits(xi, "data.frame")) {
            if (is.null(cl)) 
                cl <- oldClass(xi)
            ri <- attr(xi, "row.names")
            ni <- length(ri)
            if (is.null(clabs)) 
                clabs <- names(xi)
            else {
                pi <- match.names(clabs, names(xi))
                if (!is.null(pi)) 
                  perm[[i]] <- pi
            }
            rows[[i]] <- seq.int(from = nrow + 1, length = ni)
            rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
            nrow <- nrow + ni
            if (is.null(value)) {
                value <- unclass(xi)
                nvar <- length(value)
                all.levs <- vector("list", nvar)
                has.dim <- logical(nvar)
                facCol <- logical(nvar)
                ordCol <- logical(nvar)
                for (j in 1:nvar) {
                  xj <- value[[j]]
                  if (!is.null(levels(xj))) {
                    all.levs[[j]] <- levels(xj)
                    facCol[j] <- TRUE
                  }
                  else facCol[j] <- is.factor(xj)
                  ordCol[j] <- is.ordered(xj)
                  has.dim[j] <- length(dim(xj)) == 2
                }
            }
            else for (j in 1:nvar) {
                xij <- xi[[j]]
                if (is.null(pi) || is.na(jj <- pi[[j]])) 
                  jj <- j
                if (facCol[jj]) {
                  if (length(lij <- levels(xij)) > 0) {
                    all.levs[[jj]] <- unique(c(all.levs[[jj]], 
                      lij))
                    ordCol[jj] <- ordCol[jj] & is.ordered(xij)
                  }
                  else if (is.character(xij)) 
                    all.levs[[jj]] <- unique(c(all.levs[[jj]], 
                      xij))
                }
            }
        }
        else if (is.list(xi)) {
            ni <- range(sapply(xi, length))
            if (ni[1] == ni[2]) 
                ni <- ni[1]
            else stop("invalid list argument: all variables should have the same length")
            rows[[i]] <- ri <- as.integer(seq.int(from = nrow + 
                1, length = ni))
            nrow <- nrow + ni
            rlabs[[i]] <- Make.row.names(nmi, ri, ni, nrow)
            if (length(nmi <- names(xi)) > 0) {
                if (is.null(clabs)) 
                  clabs <- nmi
                else {
                  tmp <- match.names(clabs, nmi)
                  if (!is.null(tmp)) 
                    perm[[i]] <- tmp
                }
            }
        }
        else if (length(xi) > 0) {
            rows[[i]] <- nrow <- nrow + 1
            rlabs[[i]] <- if (nchar(nmi) > 0) 
                nmi
            else as.integer(nrow)
        }

    }
    nvar <- length(clabs)
    if (nvar == 0)

        nvar <- max(sapply(allargs, length))     if (nvar == 0)

        return(structure(list(), class = "data.frame", row.names = integer()))     pseq <- 1:nvar
    if (is.null(value)) {

        value <- list()
        value[pseq] <- list(logical(nrow))
    }
    names(value) <- clabs
    for (j in 1:nvar) if (length(lij <- all.levs[[j]]) > 0)

        value[[j]] <- factor(as.vector(value[[j]]), lij, ordered = ordCol[j])     if (any(has.dim)) {

        rmax <- max(unlist(rows))
        for (i in (1:nvar)[has.dim]) if (!inherits(xi <- value[[i]], 
            "data.frame")) {
            dn <- dimnames(xi)
            rn <- dn[[1]]
            if (length(rn) > 0) 
                length(rn) <- rmax
            pi <- dim(xi)[2]
            length(xi) <- rmax * pi
            value[[i]] <- array(xi, c(rmax, pi), list(rn, dn[[2]]))
        }

    }
    for (i in 1:n) {
        xi <- unclass(allargs[[i]])
        if (!is.list(xi)) 
            if (length(xi) != nvar) 
                xi <- rep(xi, length.out = nvar)
        ri <- rows[[i]]
        pi <- perm[[i]]
        if (is.null(pi)) 
            pi <- pseq
        for (j in 1:nvar) {
            jj <- pi[j]
            xij <- xi[[j]]
            if (has.dim[jj]) {
                value[[jj]][ri, ] <- xij
                rownames(value[[jj]])[ri] <- rownames(xij)
            }
            else {
                value[[jj]][ri] <- if (is.factor(xij)) 
                  as.vector(xij)
                else xij
                if (!is.null(nm <- names(xij))) 
                  names(value[[jj]])[ri] <- nm
            }
        }

    }
    rlabs <- unlist(rlabs)
    if (any(duplicated(rlabs)))

        rlabs <- make.unique(as.character(unlist(rlabs)), sep = "")     if (is.null(cl)) {

        as.data.frame(value, row.names = rlabs)     }
    else {

        class(value) <- cl
        attr(value, "row.names") <- rlabs
        value

    }
}

Steven McKinney

Statistician
Molecular Oncology and Breast Cancer Program British Columbia Cancer Research Centre

email: smckinney@bccrc.ca

tel: 604-675-8000 x7561

BCCRC
Molecular Oncology
675 West 10th Ave, Floor 4
Vancouver B.C.
V5Z 1L3
Canada



R-devel@r-project.org mailing list
https://stat.ethz.ch/mailman/listinfo/r-devel Received on Sat Jan 13 09:19:53 2007

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.1.8, at Fri 12 Jan 2007 - 23:31:05 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-devel. Please read the posting guide before posting to the list.