library(prozor)
#library(reshape2)

# Load the example identification results that ship with the prozor package.
# Deliberately no `rm(list = ls())` here: wiping the caller's workspace is a
# surprising side effect and still does not give a clean R session.

# mustWork = TRUE makes a missing example file fail right here instead of
# handing an empty path to the reader.
file <- system.file("extdata/IDResults.txt.gz", package = "prozor",
                    mustWork = TRUE)
specMeta <- readr::read_tsv(file)

## Parsed with column specification:
## cols(
##   RefSpectraId = col_integer(),
##   numPeaks = col_integer(),
##   peptideSeq = col_character(),
##   precursorCharge = col_integer(),
##   precursorMZ = col_double(),
##   retentionTime = col_double(),
##   copies = col_integer(),
##   peptideModSeq = col_character(),
##   score = col_double(),
##   lengthPepSeq = col_integer(),
##   fileName = col_integer(),
##   SpecIDinFile = col_integer()
## )

# Number of rows in the identification table.
nrow(specMeta)

## [1] 5000

# Histogram of the `score` column; `breaks=100` asks for roughly 100 cells.
hist(specMeta$score, breaks=100)

Annotate peptide sequences with protein sequences

# Unique peptide sequences found in the identification table.
upeptide <- unique(specMeta[["peptideSeq"]])
length(upeptide)

## [1] 1520

# Read the two FASTA databases bundled with prozor.
# NOTE: despite its name, `resRev` holds the "canSeq" file
# (presumably the canonical sequences, not reversed ones).
resAll <- prozor::readPeptideFasta(
  system.file("extdata/Annotation_allSeq.fasta.gz", package = "prozor")
)
resRev <- prozor::readPeptideFasta(
  system.file("extdata/Annotation_canSeq.fasta.gz", package = "prozor")
)

# Annotate the peptides against the "allSeq" database.
annotAll <- prozor::annotatePeptides(upeptide, resAll)

## [1] 4953

# Share of the unique peptides that show up in the annotation result.
pcAll <- length(unique(annotAll[["peptideSeq"]])) / length(upeptide)

# Same annotation against the "canSeq" database.
annotRev <- prozor::annotatePeptides(upeptide, resRev)

## [1] 1770

pcCan <- length(unique(annotRev[["peptideSeq"]])) / length(upeptide)

# Compare the annotated share for both databases.
barplot(c(Canonical = pcCan, All = pcAll))

# For both databases, plot how often each peptide occurs in the annotation
# and compute the percentage of peptides that occur exactly once.

# Three panels side by side; keep the previous graphics settings so they can
# be restored once all three panels are drawn.
old_par <- par(mfrow = c(1, 3))

# Tabulate the occurrences per peptide once and reuse the result.
pepCountsAll <- table(annotAll$peptideSeq)
pepCountsCan <- table(annotRev$peptideSeq)

plot(sort(pepCountsAll), axes = FALSE, ylab = "Nr protein IDs")
axis(2)
PCProteotypic_all <- sum(pepCountsAll == 1) / length(pepCountsAll) * 100

#plot(sort(table(annotIso$peptideSeq)),axes=F, ylab="Nr protein IDs")
#axis(2)
#PCProteotypic_iso <- sum(table(annotIso$peptideSeq) == 1)/ length(table(annotIso$peptideSeq)) * 100

plot(sort(pepCountsCan), axes = FALSE, ylab = "Nr protein IDs")
axis(2)

PCProteotypic_canonical <- sum(pepCountsCan == 1) / length(pepCountsCan) * 100

barplot(c(All = PCProteotypic_all, canonical = PCProteotypic_canonical),
        las = 2, ylab = "% proteotypic")

# Restore the layout so later plots are not squeezed into the 1 x 3 grid.
par(old_par)

Do protein inference

library(Matrix)
# Distinct combinations of modified sequence, charge and peptide sequence.
precursors <- unique(
  specMeta[, c("peptideModSeq", "precursorCharge", "peptideSeq")]
)

For trembl and swissprot

Annotate the precursors with protein IDs

# Attach protein IDs from the "allSeq" annotation to every precursor by
# joining on the peptide sequence. With merge()'s defaults, only peptides
# present in both tables are kept.
annotatedPrecursors <- merge(
  precursors,
  subset(annotAll, select = c(peptideSeq, proteinID)),
  by = "peptideSeq"
)

# Show the first rows of the joined table.
head(annotatedPrecursors)

##             peptideSeq                      peptideModSeq precursorCharge
## 1 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 2 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 3 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 4 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 5 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 6 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
##                        proteinID
## 1         tr|F5GXS0|F5GXS0_HUMAN
## 2 tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN
## 3 tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN
## 4 tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN
## 5 tr|A0A140TA32|A0A140TA32_HUMAN
## 6 tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN

# Matrix supplies the image() method used below
# (assumes prepareMatrix() returns a Matrix object - TODO confirm).
library(Matrix)

# Build the matrix relating peptides to proteins from the annotated
# precursors and display it.
xx <- prozor::prepareMatrix(
  annotatedPrecursors,
  peptideID = "peptideSeq",
  proteinID = "proteinID"
)
image(xx)

# Protein inference with prozor's greedy() on the "allSeq" annotation.
xxAll <- prozor::greedy(xx)

For swissprot no isoforms

# Repeat the precursor annotation with the "canSeq" annotation (`annotRev`),
# again joining on the peptide sequence.
annotatedPrecursors <- merge(
  precursors,
  subset(annotRev, select = c(peptideSeq, proteinID)),
  by = "peptideSeq"
)

# Build and display the matrix relating peptides to proteins.
xx <- prepareMatrix(annotatedPrecursors,
                    proteinID = "proteinID", peptideID = "peptideSeq")
image(xx)

# Protein inference with greedy() on the "canSeq" annotation.
xxCAN <- greedy(xx)

# Compare the number of distinct entries in both greedy() results
# (presumably the inferred protein IDs - confirm against prozor::greedy).
barplot(c(
  All = length(unique(unlist(xxAll))),
  canonical = length(unique(unlist(xxCAN)))
))

Number of Proteins after protein inference.

TODO

Protein Grouping and Clustering in Scaffold

Peptide Annotation and Protein Inference

Witold Wolski

March 16, 2017

Do protein inference

For trembl and swissprot

For swissprot no isoforms

TODO