a brief overview of what is done in the code.

- the input file contains 120*500*61 cells. 120*500 rows and 61 columns.
- we need to import the rows 500 at a time and perform the same operations on each batch of 500 rows
- the file contains numeric values. there are quite a lot of missing values; these are coded as NA in the text file (the file that is imported)
- for each variable we check for outliers. any value that lies more than 3 standard deviations (sd) away from the mean of its variable is set to the corresponding limit, i.e. mean + 3 sd or mean - 3 sd.
- the data set has one response variable , the first column, and 60 explanatory variables.
- we regress each of the explanatory variables against the response and record the slope of the explanatory variable. (i.e. simple linear regression is performed)
- nsize = 500 since we import 500 rows at a time
- nruns = how many groups you want to run the analysis on

# Regress a response on each explanatory variable, one batch of rows at a time.
#
# The tab-separated file `filename` holds one header line followed by numeric
# rows, with missing values written as NA. Rows are read `nsize` at a time.
# For each of the first `nruns` batches every column is clipped at
# mean +/- 3 sd, standardised to zero mean and unit sd, and columns 2..nvar
# are then used, one at a time, as the single regressor for column 1.
#
# Arguments:
#   nsize     number of data rows per batch.
#   filename  path of the tab-separated input file.
#   nvar      number of columns: 1 response plus nvar - 1 regressors.
#   nruns     number of consecutive batches to process, starting at the top.
#
# Value: a list with components
#   fit.reg  numeric nruns x (nvar - 1) matrix of regression slopes, with the
#            header names of columns 2..nvar as column names. An entry is NA
#            when no slope can be estimated (fewer than two rows where both
#            response and regressor are present, or a constant regressor).
#   output   the string "$fit.reg".
TRY <- function(nsize = 500, filename = "C:/A.txt", nvar = 61, nruns = 1) {
  stopifnot(nsize >= 1, nvar >= 2, nruns >= 0)

  # Clip one column at mean +/- 3 sd; missing entries are left untouched.
  clip_outliers <- function(x) {
    centre <- mean(x, na.rm = TRUE)
    spread <- stats::sd(x, na.rm = TRUE)
    upper <- centre + 3 * spread
    lower <- centre - 3 * spread
    # which() discards the NA comparisons that missing values produce.
    x[which(x > upper)] <- upper
    x[which(x < lower)] <- lower
    x
  }

  # Slope of the simple regression of y on x, or NA if it is not estimable.
  slope_or_na <- function(y, x) {
    paired <- !is.na(y) & !is.na(x)
    if (sum(paired) < 2) {
      return(NA_real_)
    }
    # coef() keeps an NA entry for an aliased (constant) regressor, whereas
    # summary()$coef drops that row and would make [2, 1] fail.
    unname(stats::coef(stats::lm(y ~ x))[2])
  }

  # One row of slopes per batch; stays NA wherever no slope is available.
  fit.reg <- matrix(NA_real_, nrow = nruns, ncol = nvar - 1)

  for (ii in seq_len(nruns)) {
    # Skip the header line plus every row that belongs to an earlier batch.
    skip <- 1 + (ii - 1) * nsize

    dscan <- scan(
      file = filename, sep = "\t", skip = skip, nlines = nsize,
      fill = TRUE, quiet = TRUE
    )
    if (length(dscan) == 0 || length(dscan) %% nvar != 0) {
      stop(
        "batch ", ii, " of '", filename,
        "' does not consist of complete rows of ", nvar, " values",
        call. = FALSE
      )
    }

    # Fix the column count so a short final batch is not silently reshaped.
    dm <- matrix(dscan, ncol = nvar, byrow = TRUE)
    rm(dscan)

    # Adjust for outliers, column by column.
    for (j in seq_len(nvar)) {
      dm[, j] <- clip_outliers(dm[, j])
    }

    # Standardise using the mean and sd of the clipped columns.
    centres <- apply(dm, 2, mean, na.rm = TRUE)
    spreads <- apply(dm, 2, stats::sd, na.rm = TRUE)
    dm <- sweep(sweep(dm, 2, centres, "-"), 2, spreads, "/")

    # Simple linear regression of the response on each regressor in turn.
    for (j in 2:nvar) {
      fit.reg[ii, j - 1] <- slope_or_na(dm[, 1], dm[, j])
    }
  }

  # Label the slopes with the regressor names from the header line.
  dm.names <- scan(
    file = filename, what = character(), sep = "\t", skip = 0, nlines = 1,
    fill = TRUE, quiet = TRUE
  )
  if (length(dm.names) >= nvar) {
    colnames(fit.reg) <- dm.names[2:nvar]
  } else {
    warning(
      "header of '", filename, "' has fewer than ", nvar,
      " names; slope columns left unnamed",
      call. = FALSE
    )
  }

  output <- c("$fit.reg")

  list(fit.reg = fit.reg, output = output)
}

# Run the analysis on the first batch of 500 rows and keep the result in `a`.
a <- TRY(nsize = 500, filename = "C:/A.txt", nvar = 61, nruns = 1)

