refnorm.file<-function(workdir, # Name of directory with all data files input.file, # Name of input signal file chrom.file, # Name of reference chromosome file ann.file, # Name of annotation file output.file, # Name of output signal file chrom.type, # "include" or "exclude" chromosomes in chrom file sex.chrom=c(23:24,"X","Y"), # Sex chromosome, will be excluded from input.marker.col=1, # Name or index of marker column in input signal file ann.marker.col=1, # Name or index of marker column in annotation file ann.chrom.col=2, # Name or index of chromosome column in annotation file chrom.file.IDcol=1, # Name or index of sample ID column in chromosome file chrom.file.chromcol=2, # Name or index of chromosome list column in chromosome file trim.ref=T, # Indicates whether to trim the reference set qtarget=qlnorm, # Quantile function of the target distribution target.p1=0, # first parameter of quantile function target.p2=1, # second parameter of quantile function temp.prefix="tempnorm", # temporary file name prefix max.datapts.mem=1e7, # maximum number of data points to read into memory at a time ndigits=4) # No. of digits to round result to (reduce file size) { # Generate full file name strings workdir.end<-substring(workdir,nchar(workdir)) if (workdir.end=="\\") workdir.sep="" else workdir.sep="\\" input.file<-paste(workdir,input.file,sep=workdir.sep) chrom.file<-paste(workdir,chrom.file,sep=workdir.sep) ann.file<-paste(workdir,ann.file,sep=workdir.sep) output.file<-paste(workdir,output.file,sep=workdir.sep) #temp.prefix<-paste(workdir,temp.prefix,sep=workdir.sep) # Read in annotation and reference chromosome data ann.data<-read.table(ann.file,as.is=T,header=T,sep="\t") chrom.data<-read.table(chrom.file,as.is=T,header=T,sep="\t") # Check for matches in chromosome file and input signal file chrom.IDs<-make.names(chrom.data[,chrom.file.IDcol]) signal.hdrs<-scan(input.file,sep="\n",what=character(),n=1) signal.hdrs<-make.names(unlist(strsplit(signal.hdrs,split="\t"))) n.hdrs<-length(signal.hdrs) n.mtch<-rep(0,n.hdrs) for (i in 1:n.hdrs) { n.mtch[i]<-sum(is.element(chrom.IDs,signal.hdrs[i])) } print(paste("No. of Input Data Columns Matched to Chromosome File:", sum(n.mtch>0))) print(paste("No. of Input Data Columns Matched to More than One Row of Chromosome File:", sum(n.mtch>1))) print("No. Chromosome File Matches for Each Input Data Column:") print(paste(signal.hdrs,n.mtch,sep=": ")) if (any(n.mtch>1)) { print("Error: Some Input Data File Column Matched to More Than One Row of Chromosome File.") return("Error: Some Input Data File Column Matched to More Than One Row of Chromosome File.") } # Preview size of input signal data file print(paste("Checking Size of Input Signal File:",date())) fld.cnt<-count.fields(input.file,sep="\t") nrows.data<-length(fld.cnt) ncols.data<-median(fld.cnt) error<-any(fld.cnt!=ncols.data) if (error) { print("Error: Input file has unequal number of delimiters across rows.") return("Error: Input file has unequal number of delimiters across rows.") } # Some Preliminary work on reference chromosome matching sexmark<-is.element(ann.data[,ann.chrom.col],sex.chrom) inc.chrom<-(substring(chrom.type,1,1)=="i") # Determine if data-splitting is required need.split<-((nrows.data*ncols.data)>max.datapts.mem) if (need.split) { print("Deterimined That Data Set Must Be Split to Avoid Memory Overload.") ncols.per.file<-max.datapts.mem%/%nrows.data rows.per.time<-max.datapts.mem%/%(3*ncols.data) nfiles<-ncols.data%/%ncols.per.file+1 tempfiles<-paste(temp.prefix,0:nfiles,".txt",sep="") print("Intermediate Results Will Be Written to These Temporary Files:") print(tempfiles) tempfiles<-paste(workdir,tempfiles,sep=workdir.sep) vcut.file(input.file,input.marker.col,tempfiles[1],rows.per.time) probe.names<-scan(tempfiles[1],what=character(),skip=1,sep="\n") on.sex.chr<-is.element(probe.names,ann.data[sexmark,ann.marker.col]) col1<-1 coln<-min(col1+ncols.per.file-1,ncols.data) # Split the Data File into temporary subunits for (i in 1:nfiles) { coln<-min(col1+ncols.per.file-1,ncols.data) print(paste("Generating",substring(tempfiles[i+1],nchar(workdir)),date())) vcut.file(input.file,col1:coln,tempfiles[i+1],rows.per.time) col1<-coln+1 temp.cnt<-count.fields(tempfiles[i+1]) print(length(temp.cnt)) } # Read, normalize, and overwrite each subunit for (i in 1:nfiles) { print(paste("Reading",substring(tempfiles[i+1],nchar(workdir)),date())) temp.data<-read.table(tempfiles[i+1],sep="\t",as.is=T,header=T) print(dim(temp.data)) print(paste("Normalizing",substring(tempfiles[i+1],nchar(workdir)),date())) temp.hdrs<-names(temp.data) ncol.temp<-length(temp.hdrs) temp.normind<-(1:ncol.temp)[is.element(temp.hdrs,chrom.IDs)] for (j in temp.normind) { print(paste("Normalizing ",temp.hdrs[j],": ",date())) mtch.chrom.row<-(chrom.IDs==temp.hdrs[j]) #print(paste("Matched Chromosome Data:",mtch.chrom.row)) chrom.list<-chrom.data[mtch.chrom.row,chrom.file.chromcol] print(paste(" Chrom List:",chrom.list)) chrom.list<-unlist(strsplit(chrom.list,split=" ")) chrom.list<-unlist(strsplit(chrom.list,split=",")) chrom.ind<-is.element(ann.data[,ann.chrom.col],chrom.list) mark.on.chrom<-is.element(probe.names, ann.data[chrom.ind,ann.marker.col]) ref.mark<-((mark.on.chrom&inc.chrom)|(!mark.on.chrom&!inc.chrom))&!on.sex.chr print(paste(" No. Initial Reference Markers:",sum(ref.mark))) y<-temp.data[,j] res<-ref.norm(y,ref.mark,trim.ref,qtarget,target.p1,target.p2) print(paste(" No. Final Reference Markers:",sum(res$ref.trim))) temp.data[,j]<-round(res$y.norm,ndigits) } n.wrt<-0 row1<-1 while(row1<=(nrows.data-1)) { rown<-min(row1+rows.per.time-1,nrows.data-1) print(paste("Writing Normalized Data Rows",row1,"-",rown,":",date())) X<-temp.data[row1:rown,] write.table(X,tempfiles[i+1],append=(n.wrt!=0),sep="\t", quote=F,row.names=F,col.names=(n.wrt==0)) n.wrt<-n.wrt+1 row1<-rown+1 } } print(paste("Combining Temporary Files into Final Output File:",date())) vpaste.files(tempfiles[2:(nfiles+1)],output.file,rows.per.time%/%nfiles) print(paste("Deleting Temporary Files.")) file.remove(tempfiles) print(paste("Done.",date())) } else { print("Determined Full Input Data File Can Be Read into Memory.") input.data<-read.table(input.file,sep="\t",as.is=T,header=T,nrows=nrows.data+10) print(paste("Input Signal Data File Read.")) probe.names<-input.data[,input.marker.col] on.sex.chr<-is.element(probe.names,ann.data[sexmark,ann.marker.col]) norm.col.ind<-(1:ncols.data)[n.mtch>0] for (i in norm.col.ind) { print(paste("Normalizing ",signal.hdrs[i],": ",date())) #print(chrom.IDs) mtch.chrom.row<-(chrom.IDs==signal.hdrs[i]) #print(mtch.chrom.row) chrom.list<-chrom.data[mtch.chrom.row,chrom.file.chromcol] print(paste(" Chrom List:",chrom.list)) chrom.list<-unlist(strsplit(chrom.list,split=" ")) chrom.list<-unlist(strsplit(chrom.list,split=",")) chrom.ind<-is.element(ann.data[,ann.chrom.col],chrom.list) mark.on.chrom<-is.element(probe.names, ann.data[chrom.ind,ann.marker.col]) ref.mark<-((mark.on.chrom&inc.chrom)|(!mark.on.chrom&!inc.chrom))&!on.sex.chr print(paste(" No. Initial Reference Markers:",sum(ref.mark))) y<-input.data[,i] res<-ref.norm(y,ref.mark,trim.ref,qtarget,target.p1,target.p2) print(paste(" No. Final Reference Markers:",sum(res$ref.trim))) input.data[,i]<-round(res$y.norm,ndigits) } n.wrt<-0 row1<-1 rows.per.time<-max.datapts.mem%/%(3*ncols.data) while(row1<=(nrows.data-1)) { rown<-min(row1+rows.per.time-1,(nrows.data-1)) print(paste("Writing Normalized Data Rows",row1,"-",rown,":",date())) X<-input.data[row1:rown,] write.table(X,output.file,append=(n.wrt!=0),sep="\t", quote=F,row.names=F,col.names=(n.wrt==0)) n.wrt<-n.wrt+1 row1<-rown+1 } } } ref.norm<-function(y, # original signal vector ref.mark, # logical (T/F) vector indicating reference marker status trim.ref=T, # indicates if reference set should be trimmed qtarget=qlnorm, # quantile function of the target distribution target.p1=0, # first parameter of quantile function target.p2=1) # second parameter of quantile function { m<-length(y) ref.orig<-ref.mark all.rank<-(rank(y)-0.5)/m if (trim.ref) { q1<-quantile(all.rank[ref.mark],1/4) q3<-quantile(all.rank[ref.mark],3/4) iqr<-q3-q1 upp<-q3+1.5*iqr low<-q1-1.5*iqr ref.mark[ref.mark]<-(all.rank[ref.mark]<=upp)& (all.rank[ref.mark]>=low) } m.ref<-sum(ref.mark) ref.rank<-(rank(y[ref.mark])-0.5)/m.ref ref.rank.all<-approx(c(0,all.rank[ref.mark],1), c(0,ref.rank,1),xout=all.rank)$y y.new<-qtarget(ref.rank.all,target.p1,target.p2) return(list(y.norm=y.new,ref.orig=ref.orig,ref.trim=ref.mark)) } vcut.file<-function(input.file,keep.cols,output.file,rows.per.read=5000) { # Print message print(paste("Extracting Desired Columns from",input.file,date())) # Read header and write desired columns hdr<-scan(input.file,sep="\n",n=1,what=character()) hdr<-unlist(strsplit(hdr,split="\t")) keep.hdr<-matrix(hdr[keep.cols],1) write.table(keep.hdr,output.file,append=F,sep="\t", col.names=F,row.names=F,quote=F) # Preview size of input file fld.cnt<-count.fields(input.file) nrows.data<-length(fld.cnt) # Initialize variables for loop row1<-2 rown<-min(nrows.data,row1+rows.per.read-1) # Loop to read sections of input data at a time while (row1<=nrows.data) { rown<-min(nrows.data,row1+rows.per.read-1) # Determine last row for block print(paste("Reading rows",row1-1,"-",rown-1,date())) # Print message X0<-read.table(input.file,sep="\t",as.is=T, # Read section of file header=F,skip=row1-1, nrows=rown-row1+1) X1<-X0[,keep.cols] # Subset on desired rows print(paste("Writing Desired Columns of Rows", row1-1,"-",rown-1,"to",output.file,date())) # Print message write.table(X1,output.file,sep="\t",row.names=F, # Write to output file col.names=F,quote=F,append=T) row1<-rown+1 # Increment starting row for reading } } vpaste.files<-function(infiles,outfile,rows.per.read=5000) { nfiles<-length(infiles) ncols<-rep(NA,nfiles) colind<-matrix(NA,nfiles,2) hdr<-NULL for (i in 1:nfiles) { temp<-scan(infiles[i],what=character(),sep="\n",n=1) temp<-unlist(strsplit(temp,split="\t")) ncols[i]<-length(temp) hdr<-c(hdr,temp) } colind[,1]<-c(1,1+cumsum(ncols[2:nfiles])) colind[,2]<-cumsum(ncols) print(colind) print(ncols) hdr.mtx<-matrix(hdr,1,sum(ncols)) hdr.line<-paste(hdr,collapse="\t") write.table(hdr.mtx,outfile,sep="\t",col.names=F,row.names=F,quote=F) fld.cnt<-count.fields(infiles[1]) nrows.data<-length(fld.cnt) row1<-1 rown<-min(row1+rows.per.read-1,nrows.data) while(row1<=nrows.data) { rows.to.read<-(rown-row1+1) temp0<-scan(infiles[1],what=character(),skip=row1,n=rows.to.read,sep="\n") for (i in 2:nfiles) { temp1<-scan(infiles[i],what=character(),skip=row1,n=rows.to.read,sep="\n") temp0<-paste(temp0,temp1,sep="\t") } row1<-rown+1 rown<-min(row1+rows.per.read-1,nrows.data) write(temp0,outfile,append=T) } } auto.refchrom.file<-function(prenorm.file, snpann.file, refchr.file, snpann.snpcol, snpann.chromcol, prenorm.snpcol=1, min.hz=0.15, exc.chrom=c(23:24,"X","Y"), logt.signal=T) { print(paste("Reading SNP Annotations:",date())) snpann.data<-read.table(snpann.file,sep="\t",as.is=T,header=T) prenorm.hdr<-scan(prenorm.file,sep="\n",n=1,what=character()) prenorm.hdr<-unlist(strsplit(prenorm.hdr,split="\t")) nhdr<-length(prenorm.hdr) print(paste("Reading prenorm file:",date())) prenorm.data<-read.table(prenorm.file,sep="\t",as.is=T,header=T) prenorm.snps<-prenorm.data[,prenorm.snpcol] snpanns.snps<-snpann.data[,snpann.snpcol] prenorm.inc<-is.element(prenorm.snps,snpanns.snps) if (any(!prenorm.inc)) stop("Prenorm File Includes Some SNPs not present in annotation file.") snpann.inc<-is.element(snpanns.snps,prenorm.snps) snpanns.snps<-snpanns.snps[snpann.inc] snpann.data<-snpann.data[snpann.inc,] prenorm.ord<-order(prenorm.snps) snpann.ord<-order(snpanns.snps) prenorm.data<-prenorm.data[prenorm.ord,] snpann.data<-snpann.data[snpann.ord,] if (any(snpann.data[,snpann.snpcol]!=prenorm.data[,prenorm.snpcol])) stop("SNP IDs do not match in annotation and prenorm files.") sgnl.col<-regexpr("call",prenorm.hdr)<0 sgnl.col[prenorm.snpcol]<-F n.sgnl<-sum(sgnl.col) sgnl.ind<-(1:nhdr)[sgnl.col] refchrom<-rep("0",n.sgnl) refsamp<-rep("",n.sgnl) chrom<-snpann.data[,snpann.chromcol] print(paste("Choosing Reference Chromosomes:",date())) for (i in 1:n.sgnl) { print(paste(" Working on Sample",i,"of",n.sgnl,":",date())) y<-prenorm.data[,sgnl.ind[i]] call<-prenorm.data[,sgnl.ind[i]+1] refsamp[i]<-prenorm.hdr[sgnl.ind[i]] refchrom[i]<-pick.ref(y,call,chrom,min.hz,exc.chrom,logt.signal) } res<-cbind.data.frame(sampID=refsamp,inc.chrom=refchrom) print(paste("Writing Reference Chromosome File:",date())) write.table(res,refchr.file,sep="\t",row.names=F,col.names=T) print(paste("Done with Automated Reference Selection:",date())) return(res) } pick.ref<-function(y,call,chrom,min.hz=0.1,exc.chrom=c(23,24,"X","Y"),logt.signal=T) { uchrom<-unique(chrom[!is.element(chrom,exc.chrom)]) nchrom<-length(uchrom) hz.rate<-ave.sgnl<-sd.sgnl<-rep(NA,nchrom) if (logt.signal) y<-log(y) for (i in 1:nchrom) { this.chrom<-(chrom==uchrom[i]) hz.rate[i]<-mean(call[this.chrom]=="AB") ave.sgnl[i]<-mean(y[this.chrom]) sd.sgnl[i]<-sd(y[this.chrom]) } hz.exc<-(hz.rate<=min.hz) chr.inc<-uchrom[!hz.exc] if (length(chr.inc)==0) return("0") ave.inc<-ave.sgnl[!hz.exc] sd.inc<-sd.sgnl[!hz.exc] rnk.ave<-rank(ave.inc) rnk.sd<-rank(sd.inc) rnk.sum<-rnk.ave+rnk.sd min.sum<-rnk.sum==min(rnk.sum) chr.pick<-chr.inc[min.sum] if (length(chr.pick)>1) chr.list<-paste(chr.pick,collapse=",") else chr.list<-as.character(chr.pick) return(chr.list) } annsubset.file<-function(workdir, input.file, ann.file, output.file, input.marker.col, ann.marker.col, max.datapts.mem=1e7) { # Generate full file name strings workdir.end<-substring(workdir,nchar(workdir)) if (workdir.end=="\\") workdir.sep="" else workdir.sep="\\" input.file<-paste(workdir,input.file,sep=workdir.sep) ann.file<-paste(workdir,ann.file,sep=workdir.sep) output.file<-paste(workdir,output.file,sep=workdir.sep) # Read annotation data into memory ann.data<-read.table(ann.file,as.is=T,header=T,sep="\t") ann.snps<-ann.data[,ann.marker.col] # Determine size of input file fld.cnt<-count.fields(input.file,sep="\t") if (sum(fld.cnt)0) rep.rown<-rown-(nrd>0) print(paste("Writing Desired Rows among rows",rep.row1,"-",rep.rown,date())) write.table(X1,output.file,sep="\t",row.names=F, col.names=(nrd==0),quote=F,append=T) nrd<-nrd+1 row1<-rown+1+(nrd==1) } } }