[R] efficient code. how to reduce running time?

From: miraceti <miraceti_at_gmail.com>
Date: Sun 21 Jan 2007 - 17:37:49 GMT


Hi,
I am new to R.
and even though I've made my code to run and do what it needs to . It is taking forever and I can't use it like this. I was wondering if you could help me find ways to fix the code to run faster.
Here are my codes..
the data set is a bunch of 0s and 1s in a data.frame. What I am doing is this.
I pick a column and make up a new column Y with values associated with that column I picked.
then I remove the column.
and see if any other columns in the data have a significant association with the Y column I've generated.
If you see anything obvious that takes a long time, any suggestions would be appreciated.
thanks a lot.

Mira



#sub function for finding index

rfind <- function(x)seq(along=x)[x*(1-x)>MAF*(1-MAF)]

#sub function for permutation test

perm.F = function(y,x,nperms,sites)
{
  maxF = c();
  for (i in 1:nperms)
  {
    F=numeric(S) #create an empty vector to store the F-values     newY=sample(y,length(y)) #permute the cancer types     newX = cbind(x, newY);
    # anova for all sites
    for ( i in sites )
    {

      a <- anova(lm(newY~factor(newX[,i])));
      F[i] <- a$`F value`[1];

    }
    MSSid <- which (F == max(F)); # index of MSS (Most Significant Site)     maxF = cbind(maxF,max(F));
  }
  maxF;
}

# set the output file

sink("/tmp/R.out.3932.100")
# load the dataset

snp = read.table(file("/tmp/msoutput.3932.100"))
#print (snp);

# pi: desired proportion of variation due to QTN
pi = 0.05;
print (paste("pi:", pi));
MAF = 0.05;
print (paste("MAF:", MAF));
# S: number of segregating sites

S = length(snp[1,]);
# N: number of samples

N = length(snp[,1]);

Dips = sample(1:N,N)
DipA = Dips[1:50]
DipB = Dips[51:100]

disnp = snp[DipA,]+snp[DipB,]
snp = as.data.frame(disnp, row.names=NULL); N = length(snp[,1]);

# get allele freq for all SNPs

allele_f <- mean(snp[,1:S])/2;
print (allele_f);
sites = rfind(allele_f);
print(sites);

# collapse sites that have perfect correlation
newsites <- sites;
for (i in 1:(length(sites)-1))
{
  for (j in (i+1):length(sites))
  {
    test = (snp[sites[i]] == snp[sites[j]])     if ( all(test) || all(!test) )
    {

      print (paste("perfect correlation with", sites[i]));
      print (paste("removing alleles", sites[j]));
      newsites <- newsites[newsites!=sites[j]];
    }
  }
}
sites <- newsites;
print(sites);

# QTN: the site nearest right to S/4

sitesid = floor(length(sites)/4);
QTNid = sites[sitesid];
QTN = snp[,QTNid];

print (paste("QTN:", names(snp[QTNid]))); print (QTN);

# remove QTN from sites

sites <- sites [ sites != QTNid ];
print(sites);
print (paste("Number of usable SNPs:", length(sites)));

# p: allele frequency of QTN

p0 = allele_f[QTNid];
p = min(p0, 1-p0);
print (paste("QTN_freq:", p));

# z: random normal deviate

z = rnorm(N, mean = 0, sd = 1);
# foreach sample give quantitative phenotype
# each row is a sample
# phenotype value depends on QTN genotype, pi, p, and z
Y <- sqrt(10-(10*pi))*z + QTN*sqrt((10*pi)/(2*p*(1-p))); snp = data.frame(cbind(snp, Y));
# anova for QTN

df=data.frame(Y=Y, QTN=factor(QTN));
QTN_a <- anova(lm(Y~QTN, data=df));

print (QTN_a);
SSB <- QTN_a$`Sum Sq`[1];
SSW <- QTN_a$`Sum Sq`[2];

QTN_PRE <- SSB / (SSB + SSW);
print (paste("var_QTN/var_tot:", QTN_PRE));

# anova for all sites

F=numeric(S) #create an empty vector to store the F-values Pval=rep(1,S) #create an empty vector to store the Pval PRE=numeric(S) #create an empty vector to store the PRE

for ( i in sites )
{
  a <- anova(lm(Y~factor(snp[,i])));
  print (a);
  F[i] <- a$`F value`[1];

  Pval[i] <- a$`Pr`[1];
  SSB <- a$`Sum Sq`[1];
  SSW <- a$`Sum Sq`[2];

  PRE[i] <- SSB / (SSB + SSW);

}
print (paste("Max F:", max(F)));
MSSid <- which (F == max(F)); # index of MSS (Most Significant Site) MSS = snp[,MSSid];
print (paste("MSS(Most Significant Site):", MSSid)); p0 = length(MSS[MSS==0])/N;
p = min(p0, 1-p0);
print (paste("assoc_freq:", p));
print (paste("assoc_var:", PRE[MSSid]));
#lets do a permutation test

Fdist <- perm.F(Y, snp[,1:S], 1000, sites); print ("permutation test maxF dist");
print (Fdist);
pvalue <- mean(Fdist>F[MSSid]);
print (paste("assoc_prob:", pvalue));

# close the output file

sink()


        [[alternative HTML version deleted]]



R-help@stat.math.ethz.ch mailing list
https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code. Received on Mon Jan 22 04:45:32 2007

Archive maintained by Robert King, hosted by the discipline of statistics at the University of Newcastle, Australia.
Archive generated by hypermail 2.1.8, at Sun 21 Jan 2007 - 22:30:33 GMT.

Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-help. Please read the posting guide before posting to the list.