[R] Recursive-SVM (R-SVM)

From: Nair, Murlidharan T <mnair_at_iusb.edu>
Date: Mon 22 Jan 2007 - 20:50:20 GMT


I am trying to implement a simple r-svm example using the iris data (only two of the classes are taken and data is within the code). I am running into some errors. I am not an expert on svm's. If any one has used it, I would appreciate their help. I am appending the code below. Thanks../Murli  

#######################################################
### R-code for R-SVM

### use leave-one-out / Nfold or bootstrape to permute data for external CV

### build SVM model and use mean-balanced weight to sort genes on training set

### and recursive elimination of least important genes

### author: Dr. Xin Lu, Research Scientist

### Biostatistics Department, Harvard School of Public Health

library(e1071)

## read in SVM formated data in filename

## the format following the defination of SVMTorch

## the first line contains 2 integer: nSample nFeature+1

## followed by a matrix, each row for one sample, with the last column being +/1 1 for class label

ReadSVMdata <- function(filename)

{

dd <- read.table( filename, header=F, skip=1)

x <- as.matrix( dd[, 1:(ncol(dd)-1)] )

y <- factor( dd[, ncol(dd)] )

ret <- list(x=x, y=y)

}

## create a decreasing ladder for recursive feature elimination

CreatLadder <- function( Ntotal, pRatio=0.75, Nmin=5 )

{

x <- vector()

x[1] <- Ntotal

for( i in 1:100 )

{

pp <- round(x[i] * pRatio)

if( pp == x[i] )

{

pp <- pp-1

}

if( pp >= Nmin )

{

x[i+1] <- pp

} else

{

break

}

}

x

}

## R-SVM core code

## input:

## x: row matrix of data

## y: class label: 1 / -1 for 2 classes

## CVtype:

## integer: N fold CV

## "LOO": leave-one-out CV

## "bootstrape": bootstrape CV

## CVnum: number of CVs

## LOO: defined as sample size

## Nfold and bootstrape: user defined, default as sample size

## output: a named list

## Error: a vector of CV error on each level

## SelFreq: a matrix for the frequency of each gene being selected in each level

## with each column corresponds to a level of selection

## and each row for a gene

## The top important gene in each level are those high-freqent ones

RSVM <- function(x, y, ladder, CVtype, CVnum=0 )

{

## check if y is binary response

Ytype <- names(table(y))

if( length(Ytype) != 2)

{

print("ERROR!! RSVM can only deal with 2-class problem")

return(0)

}

## class mean

m1 <- apply(x[ which(y==Ytype[1]), ], 2, mean)

m2 <- apply(x[ which(y==Ytype[2]), ], 2, mean)

md <- m1-m2

yy <- vector( length=length(y))

yy[which(y==Ytype[1])] <- 1

yy[which(y==Ytype[2])] <- -1

y <- yy

## check ladder

if( min(diff(ladder)) >= 0 )

{

print("ERROR!! ladder must be monotonously decreasing")

return(0);

}

if( ladder[1] != ncol(x) )

{

ladder <- c(ncol(x), ladder)

}

nSample <- nrow(x)

nGene <- ncol(x)

SampInd <- seq(1, nSample)

if( CVtype == "LOO" )

{

CVnum <- nSample

} else

{

if( CVnum == 0 )

{

CVnum <- nSample

}

}

## vector for test error and number of tests

ErrVec <- vector( length=length(ladder))

names(ErrVec) <- paste("Lev_", ladder, sep="")

nTests <- 0

SelFreq <- matrix( 0, nrow=nGene, ncol=length(ladder))

colnames(SelFreq) <- paste("Lev_", ladder, sep="")

## for each CV

for( i in 1:CVnum )

{

## split data

if( CVtype == "LOO" )

{

TestInd <- i

TrainInd <- SampInd[ -TestInd]

} else

{

if( CVtype == "bootstrape" )

{

TrainInd <- sample(SampInd, nSample, replace=T )

TestInd <- SampInd[ which(!(SampInd %in% TrainInd ))]

} else

{

## Nfold

TrainInd <- sample(SampInd, nSample*(CVtype-1)/CVtype )

TestInd <- SampInd[ which(!(SampInd %in% TrainInd ))]

}

}

nTests <- nTests + length(TestInd)

## in each level, train a SVM model and record test error

xTrain <- x[TrainInd, ]

yTrain <- y[TrainInd]

xTest <- x[TestInd,]

yTest <- y[TestInd]

## index of the genes used in the

SelInd <- seq(1, nGene)

for( gLevel in 1:length(ladder) )

{

## record the genes selected in this ladder

SelFreq[SelInd, gLevel] <- SelFreq[SelInd, gLevel] +1

## train SVM model and test error

svmres <- svm(xTrain[, SelInd], yTrain, scale=F, type="C-classification", kernel="linear" )

if( CVtype == "LOO" )

{

svmpred <- predict(svmres, matrix(xTest[SelInd], nrow=1) )

} else

{

svmpred <- predict(svmres, xTest[, SelInd] )

}

ErrVec[gLevel] <- ErrVec[gLevel] + sum(svmpred != yTest )

## weight vector

W <- t(svmres$coefs*yTrain[svmres$index]) %*% svmres$SV * md[SelInd]

rkW <- rank(W)

if( gLevel < length(ladder) )

{

SelInd <- SelInd[which(rkW > (ladder[gLevel] - ladder[gLevel+1]))]

}

}

}

ret <- list(ladder=ladder, Error=ErrVec/nTests, SelFreq=SelFreq)

}

SummaryRSVM <- function( RSVMres )

{

ERInd <- max( which(RSVMres$Error == min(RSVMres$Error)) )

MinLevel <- RSVMres$ladder[ERInd]

FreqVec <- RSVMres$SelFreq[, ERInd]

SelInd <- which( rank(FreqVec) >= (ladder[1]-MinLevel) )

# print("MinCV error of", min(RSVMres$Error), "at", MinLevel, "genes" )

ret <- list( MinER=min(RSVMres$Error), MinLevel=MinLevel, SelInd=SelInd)

}

###########################################

#my code starts below

#data<-ReadSVMdata("iris_r-svm.txt")

#The data read from the file is given below.

data<-structure(list(x = structure(c(5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6,

5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1,

5.4, 5.1, 4.6, 5.1, 4.8, 5, 5, 5.2, 5.2, 4.7, 4.8, 5.4, 5.2,

5.5, 4.9, 5, 5.5, 4.9, 4.4, 5.1, 5, 4.5, 4.4, 5, 5.1, 4.8, 5.1,

4.6, 5.3, 5, 7, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2,

5, 5.9, 6, 6.1, 5.6, 6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3,

6.1, 6.4, 6.6, 6.8, 6.7, 6, 5.7, 5.5, 5.5, 5.8, 6, 5.4, 6, 6.7,

6.3, 5.6, 5.5, 5.5, 6.1, 5.8, 5, 5.6, 5.7, 5.7, 6.2, 5.1, 5.7,

3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3,

3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3, 3.4,

3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.6, 3, 3.4,

3.5, 2.3, 3.2, 3.5, 3.8, 3, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1,

2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2, 3, 2.2, 2.9, 2.9, 3.1,

3, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3, 2.8, 3, 2.9, 2.6,

2.4, 2.4, 2.7, 2.7, 3, 3.4, 3.1, 2.3, 3, 2.5, 2.6, 3, 2.6, 2.3,

2.7, 3, 2.9, 2.9, 2.5, 2.8, 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4,

1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5,

1.7, 1.5, 1, 1.7, 1.9, 1.6, 1.6, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5,

1.4, 1.5, 1.2, 1.3, 1.4, 1.3, 1.5, 1.3, 1.3, 1.3, 1.6, 1.9, 1.4,

1.6, 1.4, 1.5, 1.4, 4.7, 4.5, 4.9, 4, 4.6, 4.5, 4.7, 3.3, 4.6,

3.9, 3.5, 4.2, 4, 4.7, 3.6, 4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4,

4.9, 4.7, 4.3, 4.4, 4.8, 5, 4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5,

4.5, 4.7, 4.4, 4.1, 4, 4.4, 4.6, 4, 3.3, 4.2, 4.2, 4.2, 4.3,

3, 4.1, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,

0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.4, 0.2, 0.5,

0.2, 0.2, 0.4, 0.2, 0.2, 0.2, 0.2, 0.4, 0.1, 0.2, 0.2, 0.2, 0.2,

0.1, 0.2, 0.2, 0.3, 0.3, 0.2, 0.6, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2,

1.4, 1.5, 1.5, 1.3, 1.5, 1.3, 1.6, 1, 1.3, 1.4, 1, 1.5, 1, 1.4,

1.3, 1.4, 1.5, 1, 1.5, 1.1, 1.8, 1.3, 1.5, 1.2, 1.3, 1.4, 1.4,

1.7, 1.5, 1, 1.1, 1, 1.2, 1.6, 1.5, 1.6, 1.5, 1.3, 1.3, 1.3,

1.2, 1.4, 1.2, 1, 1.3, 1.2, 1.3, 1.3, 1.1, 1.3), .Dim = c(100,

4), .Dimnames = list(c("1", "2", "3", "4", "5", "6", "7", "8",

"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",

"20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",

"31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41",

"42", "43", "44", "45", "46", "47", "48", "49", "50", "51", "52",

"53", "54", "55", "56", "57", "58", "59", "60", "61", "62", "63",

"64", "65", "66", "67", "68", "69", "70", "71", "72", "73", "74",

"75", "76", "77", "78", "79", "80", "81", "82", "83", "84", "85",

"86", "87", "88", "89", "90", "91", "92", "93", "94", "95", "96",

"97", "98", "99", "100"), c("V1", "V2", "V3", "V4"))), y = structure(c(2,

2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), .Label = c("-1",

"1"), class = "factor")), .Names = c("x", "y"))

len<-length(data$y)

x<-data$x

y<-data$y

ladder<-CreatLadder(len)

RSVM(x,y,ladder,"LOO")



R-help@stat.math.ethz.ch mailing list
https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code. Received on Tue Jan 23 07:55:27 2007

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.1.8, at Mon 22 Jan 2007 - 21:30:33 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-help. Please read the posting guide before posting to the list.