refnorm.file<-function(workdir,                     # Name of directory with all data files
                       input.file,                  # Name of input signal file
                       chrom.file,                  # Name of reference chromosome file
                       ann.file,                    # Name of annotation file
                       output.file,                 # Name of output signal file
                       chrom.type,                  # "include" or "exclude" chromosomes in chrom file
                       sex.chrom=c(23:24,"X","Y"),  # Sex chromosome, will be excluded from 
                       input.marker.col=1,          # Name or index of marker column in input signal file
                       ann.marker.col=1,            # Name or index of marker column in annotation file
                       ann.chrom.col=2,             # Name or index of chromosome column in annotation file
                       chrom.file.IDcol=1,          # Name or index of sample ID column in chromosome file
                       chrom.file.chromcol=2,       # Name or index of chromosome list column in chromosome file
                       trim.ref=T,                  # Indicates whether to trim the reference set
                       qtarget=qlnorm,              # Quantile function of the target distribution
                       target.p1=0,                 # first parameter of quantile function
                       target.p2=1,                 # second parameter of quantile function
                       temp.prefix="tempnorm",      # temporary file name prefix
                       max.datapts.mem=1e7,         # maximum number of data points to read into memory at a time
                       ndigits=4)                   # No. of digits to round result to (reduce file size)

                        
{
   # Generate full file name strings
   workdir.end<-substring(workdir,nchar(workdir))
   if (workdir.end=="\\") workdir.sep=""
   else workdir.sep="\\"

   input.file<-paste(workdir,input.file,sep=workdir.sep)
   chrom.file<-paste(workdir,chrom.file,sep=workdir.sep)
   ann.file<-paste(workdir,ann.file,sep=workdir.sep)
   output.file<-paste(workdir,output.file,sep=workdir.sep)
   #temp.prefix<-paste(workdir,temp.prefix,sep=workdir.sep)

   # Read in annotation and reference chromosome data
   ann.data<-read.table(ann.file,as.is=T,header=T,sep="\t")
   chrom.data<-read.table(chrom.file,as.is=T,header=T,sep="\t")
   
   # Check for matches in chromosome file and input signal file   
   chrom.IDs<-make.names(chrom.data[,chrom.file.IDcol])
   signal.hdrs<-scan(input.file,sep="\n",what=character(),n=1)
   signal.hdrs<-make.names(unlist(strsplit(signal.hdrs,split="\t")))
   n.hdrs<-length(signal.hdrs)
   n.mtch<-rep(0,n.hdrs)
   for (i in 1:n.hdrs)
   {
      n.mtch[i]<-sum(is.element(chrom.IDs,signal.hdrs[i]))
   }
   print(paste("No. of Input Data Columns Matched to Chromosome File:",
               sum(n.mtch>0)))
   print(paste("No. of Input Data Columns Matched to More than One Row of Chromosome File:",
               sum(n.mtch>1)))

   print("No. Chromosome File Matches for Each Input Data Column:")
   print(paste(signal.hdrs,n.mtch,sep=": "))
   if (any(n.mtch>1))
   {
      print("Error: Some Input Data File Column Matched to More Than One Row of Chromosome File.")
      return("Error: Some Input Data File Column Matched to More Than One Row of Chromosome File.")
   }

   # Preview size of input signal data file
   print(paste("Checking Size of Input Signal File:",date()))
   fld.cnt<-count.fields(input.file,sep="\t")
   nrows.data<-length(fld.cnt)
   ncols.data<-median(fld.cnt)
   error<-any(fld.cnt!=ncols.data)
   if (error) 
   {
      print("Error: Input file has unequal number of delimiters across rows.")
      return("Error: Input file has unequal number of delimiters across rows.")
   }
  
   # Some Preliminary work on reference chromosome matching
   sexmark<-is.element(ann.data[,ann.chrom.col],sex.chrom)
   inc.chrom<-(substring(chrom.type,1,1)=="i")

   # Determine if data-splitting is required
   need.split<-((nrows.data*ncols.data)>max.datapts.mem)
   if (need.split)
   {
       print("Deterimined That Data Set Must Be Split to Avoid Memory Overload.")
       ncols.per.file<-max.datapts.mem%/%nrows.data
       rows.per.time<-max.datapts.mem%/%(3*ncols.data)
       nfiles<-ncols.data%/%ncols.per.file+1
       tempfiles<-paste(temp.prefix,0:nfiles,".txt",sep="")
       print("Intermediate Results Will Be Written to These Temporary Files:")
       print(tempfiles)
       tempfiles<-paste(workdir,tempfiles,sep=workdir.sep)
       vcut.file(input.file,input.marker.col,tempfiles[1],rows.per.time)
       probe.names<-scan(tempfiles[1],what=character(),skip=1,sep="\n")
       on.sex.chr<-is.element(probe.names,ann.data[sexmark,ann.marker.col])

       col1<-1
       coln<-min(col1+ncols.per.file-1,ncols.data)

       # Split the Data File into temporary subunits
       for (i in 1:nfiles)
       {
          coln<-min(col1+ncols.per.file-1,ncols.data)
          print(paste("Generating",substring(tempfiles[i+1],nchar(workdir)),date()))
          vcut.file(input.file,col1:coln,tempfiles[i+1],rows.per.time)
          col1<-coln+1
          temp.cnt<-count.fields(tempfiles[i+1])
          print(length(temp.cnt))
       }
       # Read, normalize, and overwrite each subunit
       for (i in 1:nfiles)
       {
          print(paste("Reading",substring(tempfiles[i+1],nchar(workdir)),date()))
          temp.data<-read.table(tempfiles[i+1],sep="\t",as.is=T,header=T)
          print(dim(temp.data))

          print(paste("Normalizing",substring(tempfiles[i+1],nchar(workdir)),date()))
          temp.hdrs<-names(temp.data)
          ncol.temp<-length(temp.hdrs)
          temp.normind<-(1:ncol.temp)[is.element(temp.hdrs,chrom.IDs)]

          for (j in temp.normind)
          {
              print(paste("Normalizing ",temp.hdrs[j],": ",date()))
              mtch.chrom.row<-(chrom.IDs==temp.hdrs[j])
              #print(paste("Matched Chromosome Data:",mtch.chrom.row))
              chrom.list<-chrom.data[mtch.chrom.row,chrom.file.chromcol]
              print(paste("  Chrom List:",chrom.list))
              chrom.list<-unlist(strsplit(chrom.list,split=" "))
              chrom.list<-unlist(strsplit(chrom.list,split=","))
              chrom.ind<-is.element(ann.data[,ann.chrom.col],chrom.list)
              mark.on.chrom<-is.element(probe.names,
                                        ann.data[chrom.ind,ann.marker.col]) 
              ref.mark<-((mark.on.chrom&inc.chrom)|(!mark.on.chrom&!inc.chrom))&!on.sex.chr
              print(paste("  No. Initial Reference Markers:",sum(ref.mark)))
              y<-temp.data[,j]
              res<-ref.norm(y,ref.mark,trim.ref,qtarget,target.p1,target.p2)
              print(paste("  No. Final Reference Markers:",sum(res$ref.trim)))
              temp.data[,j]<-round(res$y.norm,ndigits)
          }

          n.wrt<-0
          row1<-1
          while(row1<=(nrows.data-1))
          {
              rown<-min(row1+rows.per.time-1,nrows.data-1)
              print(paste("Writing Normalized Data Rows",row1,"-",rown,":",date()))
              X<-temp.data[row1:rown,]
              write.table(X,tempfiles[i+1],append=(n.wrt!=0),sep="\t",
                          quote=F,row.names=F,col.names=(n.wrt==0))
              n.wrt<-n.wrt+1
              row1<-rown+1
          }
         
       }
       print(paste("Combining Temporary Files into Final Output File:",date()))
       vpaste.files(tempfiles[2:(nfiles+1)],output.file,rows.per.time%/%nfiles)
       print(paste("Deleting Temporary Files."))
       file.remove(tempfiles)
       print(paste("Done.",date()))

   }
   else
   {
      print("Determined Full Input Data File Can Be Read into Memory.")
      input.data<-read.table(input.file,sep="\t",as.is=T,header=T,nrows=nrows.data+10)
      print(paste("Input Signal Data File Read."))
      probe.names<-input.data[,input.marker.col]
      on.sex.chr<-is.element(probe.names,ann.data[sexmark,ann.marker.col])
      norm.col.ind<-(1:ncols.data)[n.mtch>0]
      for (i in norm.col.ind)
      {
          print(paste("Normalizing ",signal.hdrs[i],": ",date()))
          #print(chrom.IDs)
          mtch.chrom.row<-(chrom.IDs==signal.hdrs[i])
          #print(mtch.chrom.row)
          chrom.list<-chrom.data[mtch.chrom.row,chrom.file.chromcol]
          print(paste("  Chrom List:",chrom.list))
          chrom.list<-unlist(strsplit(chrom.list,split=" "))
          chrom.list<-unlist(strsplit(chrom.list,split=","))
          chrom.ind<-is.element(ann.data[,ann.chrom.col],chrom.list)
          mark.on.chrom<-is.element(probe.names,
                                    ann.data[chrom.ind,ann.marker.col])
          ref.mark<-((mark.on.chrom&inc.chrom)|(!mark.on.chrom&!inc.chrom))&!on.sex.chr
          print(paste("  No. Initial Reference Markers:",sum(ref.mark)))
          y<-input.data[,i]
          res<-ref.norm(y,ref.mark,trim.ref,qtarget,target.p1,target.p2)
          print(paste("  No. Final Reference Markers:",sum(res$ref.trim)))
          input.data[,i]<-round(res$y.norm,ndigits)
      }
      n.wrt<-0
      row1<-1
      rows.per.time<-max.datapts.mem%/%(3*ncols.data)
      while(row1<=(nrows.data-1))
      {
         rown<-min(row1+rows.per.time-1,(nrows.data-1))
         print(paste("Writing Normalized Data Rows",row1,"-",rown,":",date()))
         X<-input.data[row1:rown,]
         write.table(X,output.file,append=(n.wrt!=0),sep="\t",
                     quote=F,row.names=F,col.names=(n.wrt==0))
         n.wrt<-n.wrt+1
         row1<-rown+1
      }
   }
  
}

ref.norm<-function(y,               # original signal vector
                   ref.mark,        # logical (T/F) vector indicating reference marker status
                   trim.ref=T,      # indicates if reference set should be trimmed
                   qtarget=qlnorm,  # quantile function of the target distribution 
                   target.p1=0,     # first parameter of quantile function
                   target.p2=1)     # second parameter of quantile function

{
   m<-length(y)
   ref.orig<-ref.mark   
   all.rank<-(rank(y)-0.5)/m
   if (trim.ref)
   {
       q1<-quantile(all.rank[ref.mark],1/4)
       q3<-quantile(all.rank[ref.mark],3/4)
       iqr<-q3-q1
       upp<-q3+1.5*iqr
       low<-q1-1.5*iqr
       ref.mark[ref.mark]<-(all.rank[ref.mark]<=upp)&
                           (all.rank[ref.mark]>=low)
   }
   m.ref<-sum(ref.mark)
   ref.rank<-(rank(y[ref.mark])-0.5)/m.ref
   ref.rank.all<-approx(c(0,all.rank[ref.mark],1),
                        c(0,ref.rank,1),xout=all.rank)$y
   y.new<-qtarget(ref.rank.all,target.p1,target.p2)
   return(list(y.norm=y.new,ref.orig=ref.orig,ref.trim=ref.mark))
}

vcut.file<-function(input.file,keep.cols,output.file,rows.per.read=5000)

{
   # Print message
   print(paste("Extracting Desired Columns from",input.file,date()))
   
   
# Read header and write desired columns
   hdr<-scan(input.file,sep="\n",n=1,what=character())
   hdr<-unlist(strsplit(hdr,split="\t"))
   keep.hdr<-matrix(hdr[keep.cols],1)
   write.table(keep.hdr,output.file,append=F,sep="\t",
               col.names=F,row.names=F,quote=F)

   # Preview size of input file
   fld.cnt<-count.fields(input.file)
   nrows.data<-length(fld.cnt)

   # Initialize variables for loop
   row1<-2
   rown<-min(nrows.data,row1+rows.per.read-1)

   # Loop to read sections of input data at a time
   while (row1<=nrows.data)
   {
      rown<-min(nrows.data,row1+rows.per.read-1)              # Determine last row for block
      print(paste("Reading rows",row1-1,"-",rown-1,date()))   # Print message
      X0<-read.table(input.file,sep="\t",as.is=T,             # Read section of file
                     header=F,skip=row1-1,
                     nrows=rown-row1+1)
      X1<-X0[,keep.cols]                                      # Subset on desired rows
      print(paste("Writing Desired Columns of Rows",
            row1-1,"-",rown-1,"to",output.file,date()))       # Print message
      write.table(X1,output.file,sep="\t",row.names=F,        # Write to output file
                  col.names=F,quote=F,append=T)
      row1<-rown+1                                            # Increment starting row for reading
   }
}

vpaste.files<-function(infiles,outfile,rows.per.read=5000)

{
   nfiles<-length(infiles)
   ncols<-rep(NA,nfiles)
   colind<-matrix(NA,nfiles,2)
   hdr<-NULL
   for (i in 1:nfiles)
   {
      temp<-scan(infiles[i],what=character(),sep="\n",n=1)
      temp<-unlist(strsplit(temp,split="\t"))
      ncols[i]<-length(temp)
      hdr<-c(hdr,temp)
   }
   colind[,1]<-c(1,1+cumsum(ncols[2:nfiles]))
   colind[,2]<-cumsum(ncols)
   print(colind)
   print(ncols)
   hdr.mtx<-matrix(hdr,1,sum(ncols))
   hdr.line<-paste(hdr,collapse="\t")
   write.table(hdr.mtx,outfile,sep="\t",col.names=F,row.names=F,quote=F)

   fld.cnt<-count.fields(infiles[1])
   nrows.data<-length(fld.cnt)
   row1<-1
   rown<-min(row1+rows.per.read-1,nrows.data)
   while(row1<=nrows.data)
   {
       rows.to.read<-(rown-row1+1)
       temp0<-scan(infiles[1],what=character(),skip=row1,n=rows.to.read,sep="\n")
       for (i in 2:nfiles)
       {
          temp1<-scan(infiles[i],what=character(),skip=row1,n=rows.to.read,sep="\n")
          temp0<-paste(temp0,temp1,sep="\t")
       }
       row1<-rown+1
       rown<-min(row1+rows.per.read-1,nrows.data)
       write(temp0,outfile,append=T)
   }

}

auto.refchrom.file<-function(prenorm.file,
                             snpann.file,
                             refchr.file,
                             snpann.snpcol,
                             snpann.chromcol,
                             prenorm.snpcol=1,
                             min.hz=0.15,
                             exc.chrom=c(23:24,"X","Y"),
                             logt.signal=T)

{
    print(paste("Reading SNP Annotations:",date()))
    snpann.data<-read.table(snpann.file,sep="\t",as.is=T,header=T)

    prenorm.hdr<-scan(prenorm.file,sep="\n",n=1,what=character())
    prenorm.hdr<-unlist(strsplit(prenorm.hdr,split="\t"))

    
    nhdr<-length(prenorm.hdr)

    print(paste("Reading prenorm file:",date()))
    prenorm.data<-read.table(prenorm.file,sep="\t",as.is=T,header=T)

    prenorm.snps<-prenorm.data[,prenorm.snpcol]
    snpanns.snps<-snpann.data[,snpann.snpcol]

    prenorm.inc<-is.element(prenorm.snps,snpanns.snps)
    if (any(!prenorm.inc)) stop("Prenorm File Includes Some SNPs not present in annotation file.")

    snpann.inc<-is.element(snpanns.snps,prenorm.snps)
    snpanns.snps<-snpanns.snps[snpann.inc]
    snpann.data<-snpann.data[snpann.inc,]

    prenorm.ord<-order(prenorm.snps)
    snpann.ord<-order(snpanns.snps)

    prenorm.data<-prenorm.data[prenorm.ord,]
    snpann.data<-snpann.data[snpann.ord,]

    if (any(snpann.data[,snpann.snpcol]!=prenorm.data[,prenorm.snpcol]))
       stop("SNP IDs do not match in annotation and prenorm files.")

    sgnl.col<-regexpr("call",prenorm.hdr)<0
    sgnl.col[prenorm.snpcol]<-F
    n.sgnl<-sum(sgnl.col)
    sgnl.ind<-(1:nhdr)[sgnl.col]

    refchrom<-rep("0",n.sgnl)
    refsamp<-rep("",n.sgnl)

    chrom<-snpann.data[,snpann.chromcol]

    print(paste("Choosing Reference Chromosomes:",date()))
    for (i in 1:n.sgnl)
    {
        print(paste("  Working on Sample",i,"of",n.sgnl,":",date()))
        y<-prenorm.data[,sgnl.ind[i]]
        call<-prenorm.data[,sgnl.ind[i]+1]
        refsamp[i]<-prenorm.hdr[sgnl.ind[i]]
        refchrom[i]<-pick.ref(y,call,chrom,min.hz,exc.chrom,logt.signal)
    }

    res<-cbind.data.frame(sampID=refsamp,inc.chrom=refchrom)

    print(paste("Writing Reference Chromosome File:",date()))
    write.table(res,refchr.file,sep="\t",row.names=F,col.names=T)
    print(paste("Done with Automated Reference Selection:",date()))
    return(res)
}
                             
pick.ref<-function(y,call,chrom,min.hz=0.1,exc.chrom=c(23,24,"X","Y"),logt.signal=T)

{
   uchrom<-unique(chrom[!is.element(chrom,exc.chrom)])
   nchrom<-length(uchrom)
   hz.rate<-ave.sgnl<-sd.sgnl<-rep(NA,nchrom)
   if (logt.signal)  y<-log(y)
   for (i in 1:nchrom)
   {
      this.chrom<-(chrom==uchrom[i])
      hz.rate[i]<-mean(call[this.chrom]=="AB")
      ave.sgnl[i]<-mean(y[this.chrom])
      sd.sgnl[i]<-sd(y[this.chrom])
   }
   hz.exc<-(hz.rate<=min.hz)
   chr.inc<-uchrom[!hz.exc]

   if (length(chr.inc)==0) return("0")

   ave.inc<-ave.sgnl[!hz.exc]
   sd.inc<-sd.sgnl[!hz.exc]

   rnk.ave<-rank(ave.inc)
   rnk.sd<-rank(sd.inc)
  
   rnk.sum<-rnk.ave+rnk.sd

   min.sum<-rnk.sum==min(rnk.sum)

   chr.pick<-chr.inc[min.sum]
   if (length(chr.pick)>1)  chr.list<-paste(chr.pick,collapse=",")
   else chr.list<-as.character(chr.pick)

   return(chr.list)   
}


annsubset.file<-function(workdir,
                         input.file,
                         ann.file,
                         output.file,
                         input.marker.col,
                         ann.marker.col,
                         max.datapts.mem=1e7)

{
   # Generate full file name strings
   workdir.end<-substring(workdir,nchar(workdir))
   if (workdir.end=="\\") workdir.sep=""
   else workdir.sep="\\"

   input.file<-paste(workdir,input.file,sep=workdir.sep)
   ann.file<-paste(workdir,ann.file,sep=workdir.sep)
   output.file<-paste(workdir,output.file,sep=workdir.sep)

   # Read annotation data into memory
   ann.data<-read.table(ann.file,as.is=T,header=T,sep="\t")
   ann.snps<-ann.data[,ann.marker.col]

   # Determine size of input file
   fld.cnt<-count.fields(input.file,sep="\t")
   if (sum(fld.cnt)<max.datapts.mem) # Determine if full input file can be read into memory
   {
      input.data<-read.table(input.file,as.is=T,header=T,sep="\t")
      input.snps<-input.data[,input.marker.col]
      keep.snps<-is.element(input.snps,ann.snps)
      input.data<-input.data[keep.snps,]
      write.table(input.data,output.file,sep="\t",row.names=F,quote=F)  
   }
   else # Can't be read into memory
   {
      rows.per.read<-max.datapts.mem%/%(3*fld.cnt[1])
      nrows.data<-length(fld.cnt)
      row1<-1
      nrd<-0
      rown<-min(row1+rows.per.read-1,nrows.data)
      while (row1<=nrows.data)
      {
         rown<-min(nrows.data,row1+rows.per.read-1)
         print(paste("Reading rows",row1,"-",rown,date()))
         X0<-read.table(input.file,sep="\t",as.is=T,
                        header=(nrd==0),skip=row1-1,
                        nrows=rown-row1+1)

         keep.snps<-is.element(X0[,input.marker.col],ann.snps)
         X1<-X0[keep.snps,]
         rep.row1<-row1-(nrd>0)
         rep.rown<-rown-(nrd>0)
         print(paste("Writing Desired Rows among rows",rep.row1,"-",rep.rown,date()))
         write.table(X1,output.file,sep="\t",row.names=F,
                     col.names=(nrd==0),quote=F,append=T)
         nrd<-nrd+1
         row1<-rown+1+(nrd==1)
      }
   }
}