Capture of differentially expressed genes are essential in analyzing -omics data, especially in gene expression analysis. In the gene expression analysis, samples with different condidtions are collected and compared. The brt package provides a method of conducting the statisticial inference by integrating 'biological relevance' effects. This vignette explains the use of the package with simulated microarray data, and count data. brt package version: 1.3.0.
sample_n = 100
gene_n = 100
set.seed(101)
gene_mean_diff = rnorm(gene_n, 0, 5)
grand_mean = 100
sample_sd = 1
simulate_pop =
do.call(cbind
, lapply(1:gene_n, function(k){
out = data.frame(gene=c(rnorm(sample_n, 0, sample_sd)+grand_mean
,rnorm(sample_n, gene_mean_diff[k], sample_sd)+grand_mean)
)
colnames(out) = paste0('gene_', k)
return(out)
}))
dta = cbind(pop=rep(c('trt', 'contr'), each=sample_n), samples=rep(c(1:sample_n), 2), simulate_pop)
dta[c(1:5, 101:105), 1:7]
## pop samples gene_1 gene_2 gene_3 gene_4 gene_5
## 1 trt 1 100.26807 98.83414 98.72764 100.02777 100.05739
## 2 trt 2 99.40779 99.76020 99.33693 100.14471 99.91227
## 3 trt 3 102.13349 101.18438 99.70346 99.76244 99.54522
## 4 trt 4 101.17275 100.20751 99.42603 99.25715 101.45655
## 5 trt 5 100.74676 99.95637 99.97383 100.26091 101.61884
## 101 contr 1 98.20579 102.44177 96.23597 99.61206 102.05704
## 102 contr 2 96.98654 100.67039 95.55263 101.37330 99.45638
## 103 contr 3 98.79333 100.91048 96.22993 101.37826 101.45936
## 104 contr 4 97.57933 103.20197 95.78029 101.44521 102.56839
## 105 contr 5 99.57974 102.90421 95.96934 100.83086 101.47445
summary(dta[, c(paste0('gene_',1:4))])
## gene_1 gene_2 gene_3 gene_4
## Min. : 96.20 Min. : 96.82 Min. : 94.12 Min. : 96.87
## 1st Qu.: 98.28 1st Qu.: 99.96 1st Qu.: 96.56 1st Qu.: 99.71
## Median : 99.17 Median :101.32 Median : 98.22 Median :100.47
## Mean : 99.16 Mean :101.26 Mean : 98.28 Mean :100.49
## 3rd Qu.: 99.93 3rd Qu.:102.50 3rd Qu.:100.07 3rd Qu.:101.37
## Max. :102.13 Max. :104.86 Max. :102.23 Max. :103.41
if(!'reshape2'%in%rownames(installed.packages())){install.packages('reshape2')}
plot.dta = reshape2::melt(dta[, c('pop', 'gene_1', 'gene_2', 'gene_3', 'gene_4')], id.vars='pop')
colnames(plot.dta) = c('pop', 'gene', 'expressionValue')
if(!'ggplot2'%in%rownames(installed.packages())){install.packages('ggplot2')}
library(ggplot2)
ggplot(plot.dta, aes(expressionValue, color=pop, fill=pop))+
geom_density(alpha=0.1)+
facet_wrap(~gene, nrow=1)+
theme_bw()+
theme(panel.grid=element_blank())
brt_cut = 5
library(brt)
print(gene_mean_diff)
## [1] -1.63018245 2.76230928 -3.37471922 1.07179730 1.55384609
## [6] 5.86983144 3.09394928 -0.56367157 4.58514145 -1.11629682
## [11] 2.63224049 -3.97422218 7.13877772 -7.33409847 -1.18341689
## [16] -0.96668982 -4.24877370 0.29232749 -4.08835178 -10.25153908
## [21] -0.81877833 3.54261052 -1.33990273 -7.31960880 3.72217911
## [26] -7.05195091 2.33533803 -0.59660054 2.33619481 2.49067782
## [31] 4.47468600 1.39575998 5.03932875 -10.36553245 5.94926692
## [36] -3.62187109 0.83991886 4.60167579 -8.35802406 2.24234535
## [41] 2.41229405 3.79106891 -11.59663684 -2.29752390 -5.52691833
## [46] 2.01464137 2.84467459 -3.53041641 -1.45045313 -7.41939036
## [51] -5.75127641 -1.37235581 2.88950501 -6.98451323 3.74528858
## [56] -5.25593348 0.82690435 5.64904560 5.86861232 -2.13931616
## [61] -1.29901054 -7.05586522 -3.20678777 0.56228755 2.11302166
## [66] 1.93417646 -3.43899163 0.74451244 -0.28824874 -0.37411682
## [71] 7.54948719 8.09968504 5.76579083 -0.38801798 -9.09467250
## [76] -5.18722291 1.51246123 -6.38973084 0.69169524 -0.25492062
## [81] 9.26073787 5.55837635 -2.55687661 -2.71940552 -8.64463642
## [86] 2.35374769 0.02693561 6.74022893 3.62048357 7.76274582
## [91] 6.62734916 -0.17132546 -1.80506699 -3.60082711 1.41007467
## [96] -3.95262832 -2.22452275 6.82496585 2.48727169 -4.07198238
brt.result =
do.call(rbind
, lapply(1:gene_n, function(i){
dta.tmp = dta[, c(1, i+2)]
pop1 = subset(dta.tmp, pop=='trt')[, 2, drop=T]
pop2 = subset(dta.tmp, pop=='contr')[, 2, drop=T]
brt.tmp = brt.test(x=pop1, y=pop2, hi=brt_cut, var.equal=T)
t.tmp = t.test(x=pop1, y=pop2, var.equal=T)
out = data.frame(gene=colnames(dta.tmp)[2]
, true_diff=gene_mean_diff[i]
, est_diff=brt.tmp$mu_y-brt.tmp$mu_x
, brt.test=brt.tmp$brt.pvalue
, t.test=t.tmp$p.value)
}))
head(brt.result)
## gene true_diff est_diff brt.test t.test
## 1 gene_1 -1.630182 -1.586110 6.831999e-01 4.765126e-25
## 2 gene_2 2.762309 2.576459 4.854434e-01 9.877132e-44
## 3 gene_3 -3.374719 -3.544225 2.920143e-01 1.073457e-73
## 4 gene_4 1.071797 1.259994 7.483451e-01 4.058920e-17
## 5 gene_5 1.553846 1.499169 7.006130e-01 1.797829e-19
## 6 gene_6 5.869831 6.019427 6.171728e-13 2.531744e-96
plot.pvalue = reshape2::melt(brt.result[, c('brt.test', 't.test')])
## No id variables; using all as measure variables
ggplot(plot.pvalue, aes(value, color=variable, fill=variable))+
geom_histogram(alpha=0.5)+
facet_wrap(~variable, nrow=1, scales='free_y')+
theme_bw()+
theme(panel.grid=element_blank())
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot.diff.value = reshape2::melt(
with(brt.result
, data.frame(est_diff=est_diff, log_brt=-log(brt.test), log_t=-log(t.test))
)
, id.vars='est_diff')
ggplot(plot.diff.value, aes(x=est_diff, y=value, color=variable))+
geom_point()+
geom_hline(yintercept=-log(0.05), color='red')+
facet_wrap(~variable)+
theme_bw()+
theme(panel.grid=element_blank())
brt.result$fdr.brt.pvalue = p.adjust(brt.result$brt.test, method='BH')
brt.result$fdr.t.test.pvalue = p.adjust(brt.result$t.test, method='BH')
head(brt.result)
## gene true_diff est_diff brt.test t.test fdr.brt.pvalue
## 1 gene_1 -1.630182 -1.586110 6.831999e-01 4.765126e-25 9.089484e-01
## 2 gene_2 2.762309 2.576459 4.854434e-01 9.877132e-44 8.227854e-01
## 3 gene_3 -3.374719 -3.544225 2.920143e-01 1.073457e-73 5.959475e-01
## 4 gene_4 1.071797 1.259994 7.483451e-01 4.058920e-17 9.253561e-01
## 5 gene_5 1.553846 1.499169 7.006130e-01 1.797829e-19 9.098871e-01
## 6 gene_6 5.869831 6.019427 6.171728e-13 2.531744e-96 2.571553e-12
## fdr.t.test.pvalue
## 1 6.527569e-25
## 2 1.646189e-43
## 3 2.752454e-73
## 4 5.011013e-17
## 5 2.304909e-19
## 6 9.737475e-96
confusion.dta = with(brt.result
, data.frame(gene=brt.result$gene
, truly_diff=factor(true_diff>=brt_cut, levels=c('TRUE', 'FALSE'))
, brt_diff=factor(fdr.brt.pvalue<=0.05, levels=c('TRUE', 'FALSE'))
, t_diff=factor(fdr.t.test.pvalue<=0.05, levels=c('TRUE', 'FALSE')))
)
### confusion matrix based on BRT
with(confusion.dta, table(brt_diff, truly_diff))
## truly_diff
## brt_diff TRUE FALSE
## TRUE 14 17
## FALSE 1 68
### confusion matrix based on t.test
with(confusion.dta, table(t_diff, truly_diff))
## truly_diff
## t_diff TRUE FALSE
## TRUE 15 81
## FALSE 0 4
According to the confusion matrix, the accurate of BRT in discovering the expression value signficantly differ from (-5, 5) = (14+68)/100 = 0.82, but in t.test = (15+4)/100 = 0.19
if(!'pasilla'%in%rownames(installed.packages())){source("https://bioconductor.org/biocLite.R"); biocLite("pasilla")}
library(pasilla)
CountDataFile = system.file('extdata', 'pasilla_gene_counts.tsv', package='pasilla', mustWork=T)
MetaDataFile = system.file('extdata', 'pasilla_sample_annotation.csv', package='pasilla', mustWork=T)
countData = read.csv(CountDataFile, sep='\t')
colnames(countData)[-1]=paste0(colnames(countData)[-1], 'fb')
metaData = read.csv(MetaDataFile)
head(countData)
## gene_id untreated1fb untreated2fb untreated3fb untreated4fb
## 1 FBgn0000003 0 0 0 0
## 2 FBgn0000008 92 161 76 70
## 3 FBgn0000014 5 1 0 0
## 4 FBgn0000015 0 2 1 2
## 5 FBgn0000017 4664 8714 3564 3150
## 6 FBgn0000018 583 761 245 310
## treated1fb treated2fb treated3fb
## 1 0 0 1
## 2 140 88 70
## 3 4 0 0
## 4 1 0 0
## 5 6205 3072 3334
## 6 722 299 308
head(metaData)
## file condition type number.of.lanes total.number.of.reads
## 1 treated1fb treated single-read 5 35158667
## 2 treated2fb treated paired-end 2 12242535 (x2)
## 3 treated3fb treated paired-end 2 12443664 (x2)
## 4 untreated1fb untreated single-read 2 17812866
## 5 untreated2fb untreated single-read 6 34284521
## 6 untreated3fb untreated paired-end 2 10542625 (x2)
## exon.counts
## 1 15679615
## 2 15620018
## 3 12733865
## 4 14924838
## 5 20764558
## 6 10283129
if(!'DESeq2'%in%rownames(installed.packages())){source("https://bioconductor.org/biocLite.R"); biocLite("DESeq2")}
### construct the DESeq object
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, cbind, colMeans, colSums, colnames, do.call,
## duplicated, eval, evalq, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, rank, rbind, rowMeans,
## rowSums, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 3.4.3
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
##
## apply
countRawDeseq2 = DESeqDataSetFromMatrix(countData=as.matrix(countData[,-1])
, colData=metaData[, c('file', 'condition')]
, design=~condition)
### apply vst transformation
vstData = vst(countRawDeseq2)
### explore the mean-variance stablization after vst
if(!'vsn'%in%rownames(installed.packages())){source("https://bioconductor.org/biocLite.R"); biocLite("vsn")}
vsn::meanSdPlot(assay(vstData))
## Warning: package 'hexbin' was built under R version 3.4.3
### extract the transformed data
dta_vst = data.frame(gene_id=countData[,1], assay(vstData))
### apply brt to compare the treatment effect
countResult = do.call(rbind
, lapply(1:nrow(dta_vst), function(i){
tmp1 = brt.test(x=dta_vst[i,c(2:5)], y=dta_vst[i,c(6:8)], hi=0.2)
out = data.frame(gene=dta_vst[i,1]
, est_diff=tmp1$mu_y-tmp1$mu_x
, brt.test=tmp1$brt.pvalue
)
}))
hist(countResult$brt.test)