library(prozor)
#library(reshape2)

# Load the example identification results that ship with prozor.
# Deliberately no `rm(list = ls())`: a script should not wipe the workspace of
# whoever sources it.
# mustWork = TRUE turns a missing resource into an immediate error; without it
# system.file() returns "" and the failure only surfaces inside read_tsv().
idResultsPath <- system.file("extdata/IDResults.txt.gz",
                             package = "prozor",
                             mustWork = TRUE)
specMeta <- readr::read_tsv(idResultsPath)
## Parsed with column specification:
## cols(
##   RefSpectraId = col_integer(),
##   numPeaks = col_integer(),
##   peptideSeq = col_character(),
##   precursorCharge = col_integer(),
##   precursorMZ = col_double(),
##   retentionTime = col_double(),
##   copies = col_integer(),
##   peptideModSeq = col_character(),
##   score = col_double(),
##   lengthPepSeq = col_integer(),
##   fileName = col_integer(),
##   SpecIDinFile = col_integer()
## )
# Number of rows read.
nrow(specMeta)
## [1] 5000
# Distribution of the `score` column.
hist(specMeta$score, breaks = 100)

# Annotate peptide sequences with protein sequences

# Unique peptide sequences among the loaded results (computed once, reused below).
upeptide <- unique(specMeta$peptideSeq)
length(upeptide)
## [1] 1520

# Read the two peptide-level FASTA annotations that ship with prozor.
# mustWork = TRUE makes a missing file fail here instead of handing "" to
# readPeptideFasta().
# NOTE: resRev / annotRev hold the canonical-sequence annotation
# (Annotation_canSeq.fasta.gz), despite the "Rev" suffix.
resAll <- prozor::readPeptideFasta(
  system.file("extdata/Annotation_allSeq.fasta.gz",
              package = "prozor",
              mustWork = TRUE)
)
resRev <- prozor::readPeptideFasta(
  system.file("extdata/Annotation_canSeq.fasta.gz",
              package = "prozor",
              mustWork = TRUE)
)

annotAll <- prozor::annotatePeptides(upeptide, resAll)
## [1] 4953
# Fraction of the unique peptides that occur in the all-sequence annotation.
pcAll <- length(unique(annotAll$peptideSeq)) / length(upeptide)

annotRev <- prozor::annotatePeptides(upeptide, resRev)
## [1] 1770
# Fraction of the unique peptides that occur in the canonical annotation.
pcCan <- length(unique(annotRev$peptideSeq)) / length(upeptide)

# Compare the annotated fraction for the two databases.
barplot(c(Canonical = pcCan, All = pcAll))

# Three panels side by side. par() returns the previous settings, which are
# restored after the last panel so later plots are not squeezed into this layout.
oldPar <- par(mfrow = c(1, 3))

# Annotation rows per peptide for the all-sequence database (tabulated once).
hitsAll <- table(annotAll$peptideSeq)
plot(sort(hitsAll), axes = FALSE, ylab = "Nr protein IDs")
axis(2)
# Percentage of annotated peptides that occur exactly once in the annotation.
PCProteotypic_all <- sum(hitsAll == 1) / length(hitsAll) * 100

# Isoform panel is disabled: annotIso is not created anywhere in the visible
# part of this script.
#plot(sort(table(annotIso$peptideSeq)),axes=F, ylab="Nr protein IDs")
#axis(2)
#PCProteotypic_iso <- sum(table(annotIso$peptideSeq) == 1)/ length(table(annotIso$peptideSeq)) * 100

# Same view for the canonical-sequence annotation.
hitsCan <- table(annotRev$peptideSeq)
plot(sort(hitsCan), axes = FALSE, ylab = "Nr protein IDs")
axis(2)

PCProteotypic_canonical <- sum(hitsCan == 1) / length(hitsCan) * 100

barplot(c(All = PCProteotypic_all, canonical = PCProteotypic_canonical),
        las = 2, ylab = "% proteotypic")

# Put the graphics parameters back the way they were.
par(oldPar)

# Protein inference

library(Matrix)
# Distinct (peptideModSeq, precursorCharge, peptideSeq) combinations in specMeta.
precursors <- unique(
  specMeta[, c("peptideModSeq", "precursorCharge", "peptideSeq")]
)

# For TrEMBL and Swiss-Prot

# Annotate the precursors with protein IDs

# Attach the protein IDs of the all-sequence annotation to every precursor.
# Both tables share the key column name, so a single `by` is enough.
annotatedPrecursors <- merge(
  precursors,
  subset(annotAll, select = c(peptideSeq, proteinID)),
  by = "peptideSeq"
)

# (The former self-assignments of precursorCharge and peptideModSeq were no-ops
# and have been dropped.)
head(annotatedPrecursors)
##             peptideSeq                      peptideModSeq precursorCharge
## 1 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 2 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 3 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 4 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 5 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
## 6 AACAQLNDFLQEYGTQGCQV AAC[+57.0]AQLNDFLQEYGTQGC[+57.0]QV               3
##                        proteinID
## 1         tr|F5GXS0|F5GXS0_HUMAN
## 2 tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN
## 3 tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN
## 4 tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN
## 5 tr|A0A140TA32|A0A140TA32_HUMAN
## 6 tr|A0A0G2JPR0|A0A0G2JPR0_HUMAN
# Build the peptide/protein matrix from the annotated table, keyed on the
# peptideSeq and proteinID columns (see prozor::prepareMatrix).
xx <- prepareMatrix(annotatedPrecursors,
                    proteinID = "proteinID",
                    peptideID = "peptideSeq")

# Matrix is presumably needed for the image() method used on xx -- TODO confirm.
library(Matrix)
image(xx)

# Protein inference on the all-sequence matrix (see prozor::greedy).
xxAll <- greedy(xx)

# For Swiss-Prot without isoforms (canonical sequences only)

# Repeat the protein inference with the canonical-sequence annotation.
# Both tables share the key column name, so a single `by` is enough.
annotatedPrecursors <- merge(
  precursors,
  subset(annotRev, select = c(peptideSeq, proteinID)),
  by = "peptideSeq"
)

# (The former self-assignments of precursorCharge and peptideModSeq were no-ops
# and have been dropped.)

xx <- prepareMatrix(annotatedPrecursors,
                    proteinID = "proteinID",
                    peptideID = "peptideSeq")
image(xx)

xxCAN <- greedy(xx)

# Compare the number of distinct values in the two inference results.
barplot(c(
  All = length(unique(unlist(xxAll))),
  canonical = length(unique(unlist(xxCAN)))
))
# Number of proteins after protein inference.

# TODO

# Protein grouping and clustering in Scaffold