Simulation mark kimpel.R
- LOAD THE FUNCTIONS BELOW BEFORE RUNNING THE FOLLOWING SIMULATIONS
IQR.sim.func(num.sim.genes = 30000, num.arrays = 12, mean.exprs = 1000, coeff.var= 0.13, FC.range = c(1,1), IQR.filt.per = 0, FDR.cutoff = 0.20, percent.genes.real.change = 0, num.runs = 1)
IQR.sim.func(num.sim.genes = 30000, num.arrays = 12, mean.exprs = 1000, coeff.var= 0.13,
FC.range = c(1,1), IQR.filt.per = 25, FDR.cutoff = 0.20,
percent.genes.real.change = 0, num.runs = 20)
IQR.sim.func(num.sim.genes = 30000, num.arrays = 12, mean.exprs = 1000, coeff.var= 0.13,
FC.range = c(1.1, 2.25), IQR.filt.per = 0, FDR.cutoff = 0.20,
percent.genes.real.change = 3, num.runs = 20)
IQR.sim.func(num.sim.genes = 30000, num.arrays = 12, mean.exprs = 1000, coeff.var= 0.13,
FC.range = c(1.1, 2.25), IQR.filt.per = 25, FDR.cutoff = 0.20,
percent.genes.real.change = 3, num.runs = 20)
- END OF SIMULATION
-
- SIMULATION FUNCTIONS
IQR.sim.func <- function(num.sim.genes, num.arrays, mean.exprs, coeff.var,
FC.range, IQR.filt.per, FDR.cutoff, percent.genes.real.change, num.runs)
{
- a couple of utility functions to use later on
- fold change calculator
abs.fc.calc.func <- function(m1,m2){
fc <- mean(m2)/mean(m1); if (fc < 1) {fc <- 1/fc} ; round(fc, digits
= 2); fc}
- coefficient of variation calculator
cv.func <- function(vec){cv <- sd(vec)/mean(vec); cv}
for (run.num in 1:num.runs)
{
- construct matrix of random normal numbers
cv <- coeff.var
mat <-abs(matrix(rnorm((num.sim.genes * num.arrays), mean = mean.exprs,
sd = (mean.exprs * cv)), nrow = num.sim.genes, ncol
= num.arrays))
- "Seed" the matrix with genes with uniform dist. of fold changes of
- range min.change-max change
per.seed <- percent.genes.real.change #percentage of genes seeded
genes.seed <- (per.seed/100) * num.sim.genes #number of genes seeded
- marker for which genes are "true positives"(TRUE) and which are "true
- negatives"(FALSE)
genes.seed.logical <- c(rep(TRUE, genes.seed), rep(FALSE,(num.sim.genes - genes.seed)))
min.change <- FC.range[1]
max.change <- FC.range[2]
sim.FC <- seq(from = min.change, to = max.change, length.out = genes.seed)
if (genes.seed != 0)
{
for (i in 1:genes.seed) # Do seeding
{ mat[i,1:(num.arrays/2)] <- sim.FC[i] * mat[i, 1:(num.arrays/2)]} }
- Characteristics of all genes
mean.exprs.all <- mean(mat)
mean.cv.exprs.all <-mean(apply(mat, 1, cv.func))
fc.vec.all <- rep(NA, nrow(mat))
for (i in 1:nrow(mat)){fc.vec.all[i] <- # Do we want log space here? abs.fc.calc.func(mat[i,1:(num.arrays/2)], mat[i,((num.arrays/2) + 1):num.arrays])}
range.fc.all <- quantile(fc.vec.all, p = seq(0,1))
- IQR filtering
IQR.cutoff <- IQR.filt.per #the percentile of IQR below which will be
- filtered out
IQR.vec <- apply(log2(mat), 1, IQR) #compute IQR for each row of mat
- ("gene"), needs to be log2
IQR.per <- 100*rank(IQR.vec)/length(IQR.vec) #compute IQR percentile for
- each "gene"
IQR.filt <- IQR.per >= IQR.cutoff #T/F filter
mat <- mat[IQR.filt,]
genes.seed.logical <- genes.seed.logical[IQR.filt]
- Characteristics of post-filtered genes
mean.exprs.filtered <- mean(mat)
mean.cv.exprs.filtered <- mean(apply(mat, 1, cv.func))
fc.vec.filtered <- rep(NA, nrow(mat))
for (i in 1:nrow(mat)){fc.vec.filtered[i] <-
abs.fc.calc.func(mat[i,1:(num.arrays/2)], mat[i,((num.arrays/2) +
1):num.arrays])}
range.fc.filtered <- quantile(fc.vec.filtered, p = seq(0,1))
browser()
- Do t-testing
p.vec <- rep(NA, nrow(mat))
for (i in 1:length(p.vec))
{
p.vec[i] <- t.test(x = mat[i,1:(num.arrays/2)], y =
mat[i,((num.arrays/2) + 1):num.arrays],
alternative = "two.sided", var.equal = TRUE)$p.value
}
hist(p.vec, breaks = 100, main=paste("Histogram of p values. IQR.filt.per = ", IQR.filt.per,
". FC.range is ", FC.range[1], " to ", FC.range[2], sep="")
,xlab="p value")
require(geneplotter, quiet=T)
savepdf(fn = paste("hist.p.values.IQR.filt.per", IQR.filt.per,
"FC.range.", FC.range[1], FC.range[2], sep="_"), dir = getwd())
- FDR correction using BH method
p.adj.vec <- p.adjust(p.vec, method = "BH")
sig.filt <- p.adj.vec <= FDR.cutoff
tot.sig <- sum(sig.filt) #total of sig. genes
tot.true.neg.sig <- sum(sig.filt & !genes.seed.logical) #true negatives
- in genes called sig.
true.FDR <- tot.true.neg.sig/tot.sig
tot.true.pos.not.sig <- sum(!sig.filt & genes.seed.logical) #true
- positives in genes called not sig.
true.FNR <- tot.true.pos.not.sig/sum(genes.seed.logical)
- characteristics of total positive genes
if (sum(sig.filt) > 1)
{
mat <- mat[sig.filt,]
mean.exprs.sig <- mean(mat)
mean.cv.exprs.sig <- mean(apply(mat, 1, cv.func))
fc.vec.sig <- rep(NA, nrow(mat))
for (i in 1:nrow(mat)){fc.vec.sig[i] <- abs.fc.calc.func(mat[i,1:(num.arrays/2)], mat[i,((num.arrays/2)
+ 1):num.arrays])}
range.fc.sig <- quantile(fc.vec.sig, p = seq(0,1))
} else { mean.exprs.sig <- NA; mean.cv.exprs.sig <- NA; range.fc.sig <-rep(NA,2) }
- Create output datastructure
if (run.num == 1)
{
parameter <- c(
- Initial State
"number of simulation runs", "initial number of genes", "initial percentage of significant genes", "true number of significant genes", "initial seeded fold change min.", "initial seeded fold change max", "number of arrays", "initial mean expression", "initial c.v.", "initial actual fold change min.", "initial actual fold change max.",
- Post-filtered state
"IQR percentile of filter", "post-fitler number of genes", "post-filter mean expression", "post-filter c.v.", "post-filter fold change min.", "post-filter fold change max.",
- Significant Genes
"BH FDR", "sig. genes number", "sig. genes mean expression", "sig. genes c.v.", "sig. genes fold change min.", "sig. genes fold change max.", "sig. genes true FDR", "sig. genes true FNR")
out.mat <- matrix(ncol = 1, nrow = length(parameter))
}
- Create output of i and bind to out.mat
output.vec <-
c(num.runs,
num.sim.genes, per.seed, (num.sim.genes*per.seed/100), FC.range[1], FC.range[2], num.arrays, mean.exprs.all, mean.cv.exprs.all, range.fc.all[1], range.fc.all[2],
- Post-filtered state
IQR.filt.per, sum(IQR.filt), mean.exprs.filtered, mean.cv.exprs.filtered, range.fc.filtered[1], range.fc.filtered[2],
- Significant Genes
FDR.cutoff, tot.sig, mean.exprs.sig, mean.cv.exprs.sig, range.fc.sig[1], range.fc.sig[2], true.FDR, true.FNR)
if (run.num ==1){out.mat[,1] <- output.vec} else {out.mat <- cbind(out.mat, output.vec)}
print(paste("finished run ", run.num, sep = ""))
}
final.out.mat <- matrix(nrow = length(parameter), ncol = 3)
final.out.mat[,1] <- parameter
colnames(final.out.mat) <- c("parameter", "mean", "std. dev")
for (i in 1:nrow(final.out.mat))
{
err.out <- try(out.tmp <- sd(out.mat[i,]), TRUE)
if(inherits(err.out, "try-error"))
{final.out.mat[i,2:3]<-rep(NA,2)} else
{final.out.mat[i,2:3]<-c(mean(out.mat[i,]), sd(out.mat[i,]))} }
write.table(final.out.mat, "simulation.output.csv", sep="\t",
col.names=T, row.names=F, append=T)
}
- END OF R Functions
-