From: miraceti <miraceti_at_gmail.com>

Date: Sun 21 Jan 2007 - 17:37:49 GMT

#sub function for finding index

rfind <- function(x)seq(along=x)[x*(1-x)>MAF*(1-MAF)]

}

MSSid <- which (F == max(F)); # index of MSS (Most Significant Site) maxF = cbind(maxF,max(F));

}

maxF;

}

disnp = snp[DipA,]+snp[DipB,]

snp = as.data.frame(disnp, row.names=NULL); N = length(snp[,1]);

}

}

sites <- newsites;

print(sites);

QTN_PRE <- SSB / (SSB + SSW);

print (paste("var_QTN/var_tot:", QTN_PRE));

PRE[i] <- SSB / (SSB + SSW);

R-help@stat.math.ethz.ch mailing list

https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code. Received on Mon Jan 22 04:45:32 2007

Date: Sun 21 Jan 2007 - 17:37:49 GMT

Hi,

I am new to R.

and even though I've made my code to run and do what it needs to .
It is taking forever and I can't use it like this.
I was wondering if you could help me find ways to fix the code to run
faster.

Here are my codes..

the data set is a bunch of 0s and 1s in a data.frame.
What I am doing is this.

I pick a column and make up a new column Y with values associated with that
column I picked.

then I remove the column.

and see if any other columns in the data have a significant association with
the Y column I've generated.

If you see anything obvious that takes a long time, any suggestions would be
appreciated.

thanks a lot.

Mira

#sub function for finding index

rfind <- function(x)seq(along=x)[x*(1-x)>MAF*(1-MAF)]

#sub function for permutation test

perm.F = function(y,x,nperms,sites)

{

maxF = c();

for (i in 1:nperms)

{

F=numeric(S) #create an empty vector to store the F-values
newY=sample(y,length(y)) #permute the cancer types
newX = cbind(x, newY);

# anova for all sites

for ( i in sites )

{

a <- anova(lm(newY~factor(newX[,i]))); F[i] <- a$`F value`[1];

}

MSSid <- which (F == max(F)); # index of MSS (Most Significant Site) maxF = cbind(maxF,max(F));

}

maxF;

}

# set the output file

sink("/tmp/R.out.3932.100")

# load the dataset

snp = read.table(file("/tmp/msoutput.3932.100"))

#print (snp);

# pi: desired proportion of variation due to QTN

pi = 0.05;

print (paste("pi:", pi));

**MAF = 0.05;
**

print (paste("MAF:", MAF));

# S: number of segregating sites

S = length(snp[1,]);

# N: number of samples

N = length(snp[,1]);

Dips = sample(1:N,N) DipA = Dips[1:50] DipB = Dips[51:100]

disnp = snp[DipA,]+snp[DipB,]

snp = as.data.frame(disnp, row.names=NULL); N = length(snp[,1]);

# get allele freq for all SNPs

allele_f <- mean(snp[,1:S])/2;

print (allele_f);

sites = rfind(allele_f);

print(sites);

# collapse sites that have perfect correlation

newsites <- sites;

for (i in 1:(length(sites)-1))

{

for (j in (i+1):length(sites))

{

test = (snp[sites[i]] == snp[sites[j]])
if ( all(test) || all(!test) )

{

print (paste("perfect correlation with", sites[i])); print (paste("removing alleles", sites[j])); newsites <- newsites[newsites!=sites[j]];}

}

}

sites <- newsites;

print(sites);

# QTN: the site nearest right to S/4

sitesid = floor(length(sites)/4);

QTNid = sites[sitesid];

QTN = snp[,QTNid];

print (paste("QTN:", names(snp[QTNid]))); print (QTN);

# remove QTN from sites

sites <- sites [ sites != QTNid ];

print(sites);

print (paste("Number of usable SNPs:", length(sites)));

# p: allele frequency of QTN

p0 = allele_f[QTNid];

p = min(p0, 1-p0);

print (paste("QTN_freq:", p));

# z: random normal deviate

z = rnorm(N, mean = 0, sd = 1);

*# foreach sample give quantitative phenotype
*

# each row is a sample

# phenotype value depends on QTN genotype, pi, p, and z

Y <- sqrt(10-(10*pi))*z + QTN*sqrt((10*pi)/(2*p*(1-p)));
snp = data.frame(cbind(snp, Y));

# anova for QTN

df=data.frame(Y=Y, QTN=factor(QTN));

QTN_a <- anova(lm(Y~QTN, data=df));

print (QTN_a); SSB <- QTN_a$`Sum Sq`[1]; SSW <- QTN_a$`Sum Sq`[2];

QTN_PRE <- SSB / (SSB + SSW);

print (paste("var_QTN/var_tot:", QTN_PRE));

# anova for all sites

F=numeric(S) #create an empty vector to store the F-values
Pval=rep(1,S) #create an empty vector to store the Pval
PRE=numeric(S) #create an empty vector to store the PRE

for ( i in sites )

{

a <- anova(lm(Y~factor(snp[,i])));

print (a);

F[i] <- a$`F value`[1];

Pval[i] <- a$`Pr`[1]; SSB <- a$`Sum Sq`[1]; SSW <- a$`Sum Sq`[2];

PRE[i] <- SSB / (SSB + SSW);

}

print (paste("Max F:", max(F)));

MSSid <- which (F == max(F)); # index of MSS (Most Significant Site)
MSS = snp[,MSSid];

print (paste("MSS(Most Significant Site):", MSSid));
p0 = length(MSS[MSS==0])/N;

p = min(p0, 1-p0);

print (paste("assoc_freq:", p));

print (paste("assoc_var:", PRE[MSSid]));

#lets do a permutation test

Fdist <- perm.F(Y, snp[,1:S], 1000, sites);
print ("permutation test maxF dist");

print (Fdist);

pvalue <- mean(Fdist>F[MSSid]);

print (paste("assoc_prob:", pvalue));

# close the output file

sink()

[[alternative HTML version deleted]]

R-help@stat.math.ethz.ch mailing list

https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code. Received on Mon Jan 22 04:45:32 2007

Archive maintained by Robert King, hosted by
the discipline of
statistics at the
University of Newcastle,
Australia.

Archive generated by hypermail 2.1.8, at Sun 21 Jan 2007 - 22:30:33 GMT.

*
Mailing list information is available at https://stat.ethz.ch/mailman/listinfo/r-help.
Please read the posting
guide before posting to the list.
*