BRT index: An Integration of Biological Relevance and p-value for -omics data

Capture of differentially expressed genes are essential in analyzing -omics data, especially in gene expression analysis. In the gene expression analysis, samples with different condidtions are collected and compared. The brt package provides a method of conducting the statisticial inference by integrating 'biological relevance' effects. This vignette explains the use of the package with simulated microarray data, and count data. brt package version: 1.3.0.

data simulation

Set sample size = 100 with 100 genes measured.

sample_n = 100
gene_n = 100

Using normal distribution to simulated the mean difference with mean 20, standard deviation 5, 100 genes

set.seed(101)
gene_mean_diff = rnorm(gene_n, 0, 5)

Using each of the gene_mean_diff to simulate the 100 samples of each condidtion along with the grand mean = 100, and sample standard deviation = 1

grand_mean = 100
sample_sd = 1
simulate_pop = 
  do.call(cbind
          , lapply(1:gene_n, function(k){
            out = data.frame(gene=c(rnorm(sample_n, 0, sample_sd)+grand_mean
                                  ,rnorm(sample_n, gene_mean_diff[k], sample_sd)+grand_mean)
            )
            colnames(out) = paste0('gene_', k)
            return(out)
          }))
dta = cbind(pop=rep(c('trt', 'contr'), each=sample_n), samples=rep(c(1:sample_n), 2), simulate_pop)
dta[c(1:5, 101:105), 1:7]

##       pop samples    gene_1    gene_2   gene_3    gene_4    gene_5
## 1     trt       1 100.26807  98.83414 98.72764 100.02777 100.05739
## 2     trt       2  99.40779  99.76020 99.33693 100.14471  99.91227
## 3     trt       3 102.13349 101.18438 99.70346  99.76244  99.54522
## 4     trt       4 101.17275 100.20751 99.42603  99.25715 101.45655
## 5     trt       5 100.74676  99.95637 99.97383 100.26091 101.61884
## 101 contr       1  98.20579 102.44177 96.23597  99.61206 102.05704
## 102 contr       2  96.98654 100.67039 95.55263 101.37330  99.45638
## 103 contr       3  98.79333 100.91048 96.22993 101.37826 101.45936
## 104 contr       4  97.57933 103.20197 95.78029 101.44521 102.56839
## 105 contr       5  99.57974 102.90421 95.96934 100.83086 101.47445

summary(dta[, c(paste0('gene_',1:4))])

##      gene_1           gene_2           gene_3           gene_4      
##  Min.   : 96.20   Min.   : 96.82   Min.   : 94.12   Min.   : 96.87  
##  1st Qu.: 98.28   1st Qu.: 99.96   1st Qu.: 96.56   1st Qu.: 99.71  
##  Median : 99.17   Median :101.32   Median : 98.22   Median :100.47  
##  Mean   : 99.16   Mean   :101.26   Mean   : 98.28   Mean   :100.49  
##  3rd Qu.: 99.93   3rd Qu.:102.50   3rd Qu.:100.07   3rd Qu.:101.37  
##  Max.   :102.13   Max.   :104.86   Max.   :102.23   Max.   :103.41

Plot the density curve of gene_1, gene_2, gene_3, and gene_4

if(!'reshape2'%in%rownames(installed.packages())){install.packages('reshape2')}
plot.dta = reshape2::melt(dta[, c('pop', 'gene_1', 'gene_2', 'gene_3', 'gene_4')], id.vars='pop')
colnames(plot.dta) = c('pop', 'gene', 'expressionValue')
if(!'ggplot2'%in%rownames(installed.packages())){install.packages('ggplot2')}
library(ggplot2)
ggplot(plot.dta, aes(expressionValue, color=pop, fill=pop))+
  geom_density(alpha=0.1)+
  facet_wrap(~gene, nrow=1)+
  theme_bw()+
  theme(panel.grid=element_blank())

plot of chunk unnamed-chunk-4

applying brt on the simulated data

apply the brt test on the simulated data with the significant range = c(-5, 5). A student t-test was applied for comparing the result

brt_cut = 5
library(brt)
print(gene_mean_diff)

##   [1]  -1.63018245   2.76230928  -3.37471922   1.07179730   1.55384609
##   [6]   5.86983144   3.09394928  -0.56367157   4.58514145  -1.11629682
##  [11]   2.63224049  -3.97422218   7.13877772  -7.33409847  -1.18341689
##  [16]  -0.96668982  -4.24877370   0.29232749  -4.08835178 -10.25153908
##  [21]  -0.81877833   3.54261052  -1.33990273  -7.31960880   3.72217911
##  [26]  -7.05195091   2.33533803  -0.59660054   2.33619481   2.49067782
##  [31]   4.47468600   1.39575998   5.03932875 -10.36553245   5.94926692
##  [36]  -3.62187109   0.83991886   4.60167579  -8.35802406   2.24234535
##  [41]   2.41229405   3.79106891 -11.59663684  -2.29752390  -5.52691833
##  [46]   2.01464137   2.84467459  -3.53041641  -1.45045313  -7.41939036
##  [51]  -5.75127641  -1.37235581   2.88950501  -6.98451323   3.74528858
##  [56]  -5.25593348   0.82690435   5.64904560   5.86861232  -2.13931616
##  [61]  -1.29901054  -7.05586522  -3.20678777   0.56228755   2.11302166
##  [66]   1.93417646  -3.43899163   0.74451244  -0.28824874  -0.37411682
##  [71]   7.54948719   8.09968504   5.76579083  -0.38801798  -9.09467250
##  [76]  -5.18722291   1.51246123  -6.38973084   0.69169524  -0.25492062
##  [81]   9.26073787   5.55837635  -2.55687661  -2.71940552  -8.64463642
##  [86]   2.35374769   0.02693561   6.74022893   3.62048357   7.76274582
##  [91]   6.62734916  -0.17132546  -1.80506699  -3.60082711   1.41007467
##  [96]  -3.95262832  -2.22452275   6.82496585   2.48727169  -4.07198238

brt.result = 
  do.call(rbind
          , lapply(1:gene_n, function(i){
            dta.tmp = dta[, c(1, i+2)]
            pop1 = subset(dta.tmp, pop=='trt')[, 2, drop=T]
            pop2 = subset(dta.tmp, pop=='contr')[, 2, drop=T]
            brt.tmp = brt.test(x=pop1, y=pop2, hi=brt_cut, var.equal=T)
            t.tmp = t.test(x=pop1, y=pop2, var.equal=T)
            out = data.frame(gene=colnames(dta.tmp)[2]
                             , true_diff=gene_mean_diff[i]
                             , est_diff=brt.tmp$mu_y-brt.tmp$mu_x
                             , brt.test=brt.tmp$brt.pvalue
                             , t.test=t.tmp$p.value)
          }))
head(brt.result)

##     gene true_diff  est_diff     brt.test       t.test
## 1 gene_1 -1.630182 -1.586110 6.831999e-01 4.765126e-25
## 2 gene_2  2.762309  2.576459 4.854434e-01 9.877132e-44
## 3 gene_3 -3.374719 -3.544225 2.920143e-01 1.073457e-73
## 4 gene_4  1.071797  1.259994 7.483451e-01 4.058920e-17
## 5 gene_5  1.553846  1.499169 7.006130e-01 1.797829e-19
## 6 gene_6  5.869831  6.019427 6.171728e-13 2.531744e-96

plot the density curve of brt.pvalue and t.test p-value

plot.pvalue = reshape2::melt(brt.result[, c('brt.test', 't.test')])

## No id variables; using all as measure variables

ggplot(plot.pvalue, aes(value, color=variable, fill=variable))+
  geom_histogram(alpha=0.5)+
  facet_wrap(~variable, nrow=1, scales='free_y')+
  theme_bw()+
  theme(panel.grid=element_blank())

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot of chunk unnamed-chunk-6

plot the YRP plot and the volcano plot

plot.diff.value = reshape2::melt(
                        with(brt.result
                             , data.frame(est_diff=est_diff, log_brt=-log(brt.test), log_t=-log(t.test))
                        )
                  , id.vars='est_diff')
ggplot(plot.diff.value, aes(x=est_diff, y=value, color=variable))+
  geom_point()+
  geom_hline(yintercept=-log(0.05), color='red')+
  facet_wrap(~variable)+
  theme_bw()+
  theme(panel.grid=element_blank())

plot of chunk unnamed-chunk-7

compute the FDR for the brt.pvalue and t.test p-value

brt.result$fdr.brt.pvalue = p.adjust(brt.result$brt.test, method='BH')
brt.result$fdr.t.test.pvalue = p.adjust(brt.result$t.test, method='BH')
head(brt.result)

##     gene true_diff  est_diff     brt.test       t.test fdr.brt.pvalue
## 1 gene_1 -1.630182 -1.586110 6.831999e-01 4.765126e-25   9.089484e-01
## 2 gene_2  2.762309  2.576459 4.854434e-01 9.877132e-44   8.227854e-01
## 3 gene_3 -3.374719 -3.544225 2.920143e-01 1.073457e-73   5.959475e-01
## 4 gene_4  1.071797  1.259994 7.483451e-01 4.058920e-17   9.253561e-01
## 5 gene_5  1.553846  1.499169 7.006130e-01 1.797829e-19   9.098871e-01
## 6 gene_6  5.869831  6.019427 6.171728e-13 2.531744e-96   2.571553e-12
##   fdr.t.test.pvalue
## 1      6.527569e-25
## 2      1.646189e-43
## 3      2.752454e-73
## 4      5.011013e-17
## 5      2.304909e-19
## 6      9.737475e-96

compute the confusion matrix of brt.pvalue and t.test p-value

confusion.dta = with(brt.result
                     , data.frame(gene=brt.result$gene
                                  , truly_diff=factor(true_diff>=brt_cut, levels=c('TRUE', 'FALSE'))
                                  , brt_diff=factor(fdr.brt.pvalue<=0.05, levels=c('TRUE', 'FALSE'))
                                  , t_diff=factor(fdr.t.test.pvalue<=0.05, levels=c('TRUE', 'FALSE')))
                    )
### confusion matrix based on BRT
with(confusion.dta, table(brt_diff, truly_diff))

##         truly_diff
## brt_diff TRUE FALSE
##    TRUE    14    17
##    FALSE    1    68

### confusion matrix based on t.test
with(confusion.dta, table(t_diff, truly_diff))

##        truly_diff
## t_diff  TRUE FALSE
##   TRUE    15    81
##   FALSE    0     4

According to the confusion matrix, the accurate of BRT in discovering the expression value signficantly differ from (-5, 5) = (14+68)/100 = 0.82, but in t.test = (15+4)/100 = 0.19

brt workflow using count data

Import data from pasilla package (Brooks, et al., 2011)

if(!'pasilla'%in%rownames(installed.packages())){source("https://bioconductor.org/biocLite.R"); biocLite("pasilla")}
library(pasilla)
CountDataFile = system.file('extdata', 'pasilla_gene_counts.tsv', package='pasilla', mustWork=T)
MetaDataFile = system.file('extdata', 'pasilla_sample_annotation.csv', package='pasilla', mustWork=T)
countData = read.csv(CountDataFile, sep='\t')
colnames(countData)[-1]=paste0(colnames(countData)[-1], 'fb')
metaData = read.csv(MetaDataFile)
head(countData)

##       gene_id untreated1fb untreated2fb untreated3fb untreated4fb
## 1 FBgn0000003            0            0            0            0
## 2 FBgn0000008           92          161           76           70
## 3 FBgn0000014            5            1            0            0
## 4 FBgn0000015            0            2            1            2
## 5 FBgn0000017         4664         8714         3564         3150
## 6 FBgn0000018          583          761          245          310
##   treated1fb treated2fb treated3fb
## 1          0          0          1
## 2        140         88         70
## 3          4          0          0
## 4          1          0          0
## 5       6205       3072       3334
## 6        722        299        308

head(metaData)

##           file condition        type number.of.lanes total.number.of.reads
## 1   treated1fb   treated single-read               5              35158667
## 2   treated2fb   treated  paired-end               2         12242535 (x2)
## 3   treated3fb   treated  paired-end               2         12443664 (x2)
## 4 untreated1fb untreated single-read               2              17812866
## 5 untreated2fb untreated single-read               6              34284521
## 6 untreated3fb untreated  paired-end               2         10542625 (x2)
##   exon.counts
## 1    15679615
## 2    15620018
## 3    12733865
## 4    14924838
## 5    20764558
## 6    10283129

Apply Variance Stabilizing Transformation (Tibshirani 1988, Huber, et al. 2003, Anders and Huber 2010)

if(!'DESeq2'%in%rownames(installed.packages())){source("https://bioconductor.org/biocLite.R"); biocLite("DESeq2")}
### construct the DESeq object
library(DESeq2)

## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
##     as.data.frame, cbind, colMeans, colSums, colnames, do.call,
##     duplicated, eval, evalq, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, rank, rbind, rowMeans,
##     rowSums, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: DelayedArray

## Loading required package: matrixStats

## Warning: package 'matrixStats' was built under R version 3.4.3

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## 
## Attaching package: 'DelayedArray'

## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

## The following object is masked from 'package:base':
## 
##     apply

countRawDeseq2 = DESeqDataSetFromMatrix(countData=as.matrix(countData[,-1])
                                        , colData=metaData[, c('file', 'condition')]
                                        , design=~condition)
### apply vst transformation
vstData = vst(countRawDeseq2)

After vst transformation, the mean-variance relationship is being stablized.

### explore the mean-variance stablization after vst
if(!'vsn'%in%rownames(installed.packages())){source("https://bioconductor.org/biocLite.R"); biocLite("vsn")}
vsn::meanSdPlot(assay(vstData))

## Warning: package 'hexbin' was built under R version 3.4.3

plot of chunk unnamed-chunk-12

Apply brt to the stablized data, with range=c(-0.2,0.2)

### extract the transformed data
dta_vst = data.frame(gene_id=countData[,1], assay(vstData))
### apply brt to compare the treatment effect
countResult = do.call(rbind
                      , lapply(1:nrow(dta_vst), function(i){
                        tmp1 = brt.test(x=dta_vst[i,c(2:5)], y=dta_vst[i,c(6:8)], hi=0.2)
                        out = data.frame(gene=dta_vst[i,1]
                             , est_diff=tmp1$mu_y-tmp1$mu_x
                             , brt.test=tmp1$brt.pvalue
                             )
                      }))

Plot the histogram of the raw.brt.value

hist(countResult$brt.test)

plot of chunk unnamed-chunk-14