Simulation mark kimpel.R

From Organic Design wiki

Code snipits and programs written in R, S or S-PLUSCat for microarray articles

  1. LOAD THE FUNCTIONS BELOW BEFORE RUNNING THE FOLLOWING SIMULATIONS

IQR.sim.func(num.sim.genes = 30000, num.arrays = 12, mean.exprs = 1000, coeff.var= 0.13, FC.range = c(1,1), IQR.filt.per = 0, FDR.cutoff = 0.20, percent.genes.real.change = 0, num.runs = 1)

IQR.sim.func(num.sim.genes = 30000, num.arrays = 12, mean.exprs = 1000, coeff.var= 0.13,

   FC.range = c(1,1), IQR.filt.per = 25, FDR.cutoff = 0.20, 

percent.genes.real.change = 0, num.runs = 20)

IQR.sim.func(num.sim.genes = 30000, num.arrays = 12, mean.exprs = 1000, coeff.var= 0.13,

   FC.range = c(1.1, 2.25), IQR.filt.per = 0, FDR.cutoff = 0.20, 

percent.genes.real.change = 3, num.runs = 20)

IQR.sim.func(num.sim.genes = 30000, num.arrays = 12, mean.exprs = 1000, coeff.var= 0.13,

   FC.range = c(1.1, 2.25), IQR.filt.per = 25, FDR.cutoff = 0.20, 

percent.genes.real.change = 3, num.runs = 20)



      1. END OF SIMULATION


  1. SIMULATION FUNCTIONS

IQR.sim.func <- function(num.sim.genes, num.arrays, mean.exprs, coeff.var,

   FC.range, IQR.filt.per, FDR.cutoff, percent.genes.real.change, num.runs)
  {
  1. a couple of utility functions to use later on
  1. fold change calculator

abs.fc.calc.func <- function(m1,m2){

  fc <- mean(m2)/mean(m1); if (fc < 1) {fc <- 1/fc} ; round(fc, digits 

= 2); fc}

  1. coefficient of variation calculator

cv.func <- function(vec){cv <- sd(vec)/mean(vec); cv}

for (run.num in 1:num.runs)

 {
  1. construct matrix of random normal numbers

cv <- coeff.var

mat <-abs(matrix(rnorm((num.sim.genes * num.arrays), mean = mean.exprs,

                   sd = (mean.exprs * cv)), nrow = num.sim.genes, ncol 

= num.arrays))

  1. "Seed" the matrix with genes with uniform dist. of fold changes of
  2. range min.change-max change

per.seed <- percent.genes.real.change #percentage of genes seeded

genes.seed <- (per.seed/100) * num.sim.genes #number of genes seeded

  1. marker for which genes are "true positives"(TRUE) and which are "true
  2. negatives"(FALSE)

genes.seed.logical <- c(rep(TRUE, genes.seed), rep(FALSE,(num.sim.genes - genes.seed)))

min.change <- FC.range[1]

max.change <- FC.range[2]

sim.FC <- seq(from = min.change, to = max.change, length.out = genes.seed)

if (genes.seed != 0)

  {
    for (i in 1:genes.seed) # Do seeding
      {
        mat[i,1:(num.arrays/2)] <- sim.FC[i] * mat[i, 1:(num.arrays/2)]}
  }
  1. Characteristics of all genes

mean.exprs.all <- mean(mat)

mean.cv.exprs.all <-mean(apply(mat, 1, cv.func))

fc.vec.all <- rep(NA, nrow(mat))

for (i in 1:nrow(mat)){fc.vec.all[i] <- # Do we want log space here? abs.fc.calc.func(mat[i,1:(num.arrays/2)], mat[i,((num.arrays/2) + 1):num.arrays])}

range.fc.all <- quantile(fc.vec.all, p = seq(0,1))

  1. IQR filtering

IQR.cutoff <- IQR.filt.per #the percentile of IQR below which will be

  1. filtered out

IQR.vec <- apply(log2(mat), 1, IQR) #compute IQR for each row of mat

  1. ("gene"), needs to be log2

IQR.per <- 100*rank(IQR.vec)/length(IQR.vec) #compute IQR percentile for

  1. each "gene"

IQR.filt <- IQR.per >= IQR.cutoff #T/F filter

mat <- mat[IQR.filt,]

genes.seed.logical <- genes.seed.logical[IQR.filt]

  1. Characteristics of post-filtered genes

mean.exprs.filtered <- mean(mat)

mean.cv.exprs.filtered <- mean(apply(mat, 1, cv.func))

fc.vec.filtered <- rep(NA, nrow(mat))

for (i in 1:nrow(mat)){fc.vec.filtered[i] <-

   abs.fc.calc.func(mat[i,1:(num.arrays/2)], mat[i,((num.arrays/2) + 

1):num.arrays])}

range.fc.filtered <- quantile(fc.vec.filtered, p = seq(0,1))

browser()

  1. Do t-testing

p.vec <- rep(NA, nrow(mat))

for (i in 1:length(p.vec))

  {
    p.vec[i] <- t.test(x = mat[i,1:(num.arrays/2)], y = 

mat[i,((num.arrays/2) + 1):num.arrays],

            alternative = "two.sided", var.equal = TRUE)$p.value
  }

hist(p.vec, breaks = 100, main=paste("Histogram of p values. IQR.filt.per = ", IQR.filt.per,

     ". FC.range is ", FC.range[1], " to ", FC.range[2], sep="") 

,xlab="p value")

require(geneplotter, quiet=T)

savepdf(fn = paste("hist.p.values.IQR.filt.per", IQR.filt.per,

     "FC.range.", FC.range[1], FC.range[2], sep="_"), dir = getwd())


  1. FDR correction using BH method

p.adj.vec <- p.adjust(p.vec, method = "BH")

sig.filt <- p.adj.vec <= FDR.cutoff

tot.sig <- sum(sig.filt) #total of sig. genes

tot.true.neg.sig <- sum(sig.filt & !genes.seed.logical) #true negatives

  1. in genes called sig.

true.FDR <- tot.true.neg.sig/tot.sig

tot.true.pos.not.sig <- sum(!sig.filt & genes.seed.logical) #true

  1. positives in genes called not sig.

true.FNR <- tot.true.pos.not.sig/sum(genes.seed.logical)

  1. characteristics of total positive genes

if (sum(sig.filt) > 1)

  {
    mat <- mat[sig.filt,]
    mean.exprs.sig <- mean(mat)
    mean.cv.exprs.sig <- mean(apply(mat, 1, cv.func))
    fc.vec.sig <- rep(NA, nrow(mat))
    for (i in 1:nrow(mat)){fc.vec.sig[i] <-
        abs.fc.calc.func(mat[i,1:(num.arrays/2)], mat[i,((num.arrays/2) 

+ 1):num.arrays])}

    range.fc.sig <- quantile(fc.vec.sig, p = seq(0,1))
  } else {
    mean.exprs.sig <- NA; mean.cv.exprs.sig <- NA; range.fc.sig <-rep(NA,2)
  }
  1. Create output datastructure

if (run.num == 1)

  {

parameter <- c(

  1. Initial State

"number of simulation runs", "initial number of genes", "initial percentage of significant genes", "true number of significant genes", "initial seeded fold change min.", "initial seeded fold change max", "number of arrays", "initial mean expression", "initial c.v.", "initial actual fold change min.", "initial actual fold change max.",

  1. Post-filtered state

"IQR percentile of filter", "post-fitler number of genes", "post-filter mean expression", "post-filter c.v.", "post-filter fold change min.", "post-filter fold change max.",

  1. Significant Genes

"BH FDR", "sig. genes number", "sig. genes mean expression", "sig. genes c.v.", "sig. genes fold change min.", "sig. genes fold change max.", "sig. genes true FDR", "sig. genes true FNR")

out.mat <- matrix(ncol = 1, nrow = length(parameter))

}


  1. Create output of i and bind to out.mat

output.vec <-

  c(num.runs,

num.sim.genes, per.seed, (num.sim.genes*per.seed/100), FC.range[1], FC.range[2], num.arrays, mean.exprs.all, mean.cv.exprs.all, range.fc.all[1], range.fc.all[2],

  1. Post-filtered state

IQR.filt.per, sum(IQR.filt), mean.exprs.filtered, mean.cv.exprs.filtered, range.fc.filtered[1], range.fc.filtered[2],

  1. Significant Genes

FDR.cutoff, tot.sig, mean.exprs.sig, mean.cv.exprs.sig, range.fc.sig[1], range.fc.sig[2], true.FDR, true.FNR)

if (run.num ==1){out.mat[,1] <- output.vec} else {out.mat <- cbind(out.mat, output.vec)}

print(paste("finished run ", run.num, sep = ""))

}

final.out.mat <- matrix(nrow = length(parameter), ncol = 3)

final.out.mat[,1] <- parameter

colnames(final.out.mat) <- c("parameter", "mean", "std. dev")

for (i in 1:nrow(final.out.mat))

    {
      err.out <- try(out.tmp <- sd(out.mat[i,]), TRUE)
      if(inherits(err.out, "try-error"))
       {final.out.mat[i,2:3]<-rep(NA,2)} else
         {final.out.mat[i,2:3]<-c(mean(out.mat[i,]), sd(out.mat[i,]))}
     }


write.table(final.out.mat, "simulation.output.csv", sep="\t", col.names=T, row.names=F, append=T)

}

      1. END OF R Functions