Download sequences

GenBank

# Temporary files that receive the GenBank record and its FASTA version.
tmpgb <- tempfile(fileext = ".gb")
tmpfa <- tempfile(fileext = ".fa")

# Request accession AB115403 once per output format.
download_genbank(acc = "AB115403", format = "genbank", outfile = tmpgb)
download_genbank(acc = "AB115403", format = "fasta", outfile = tmpfa)

# Show the first ten lines of the GenBank file.
readLines(tmpgb)[1:10]
##  [1] "LOCUS       AB115403                 561 bp    RNA     linear   VRL 31-JUL-2004"
##  [2] "DEFINITION  Transmissible gastroenteritis virus ORF 7 gene for protein 7,"      
##  [3] "            complete cds, strain:h-5."                                          
##  [4] "ACCESSION   AB115403"                                                           
##  [5] "VERSION     AB115403.1"                                                         
##  [6] "KEYWORDS    ."                                                                  
##  [7] "SOURCE      Transmissible gastroenteritis virus"                                
##  [8] "  ORGANISM  Transmissible gastroenteritis virus"                                
##  [9] "            Viruses; Riboviria; Nidovirales; Cornidovirineae; Coronaviridae;"   
## [10] "            Orthocoronavirinae; Alphacoronavirus; Tegacovirus."
# Print every line of the file that the format = 'fasta' download wrote to.
readLines(tmpfa)
##  [1] ">AB115403.1 Transmissible gastroenteritis virus ORF 7 gene for protein 7, complete cds, strain:h-5"
##  [2] "GTTTGATGACACACAGGTTGAGATAATTGATGAGGTAACGAACTAAACGAGATGCTCGTCCTCCTCCATG"                            
##  [3] "CTGTATTTATTACAGTTTTAATCTTACTACTAATTGGTAGACTCCAATTATTAGAAAGATTATTACTTAA"                            
##  [4] "TCACTCTTTCAATCTTAAAACTGTTAATGATTTTAATATCTTATATAGGAGTTTAGCAGAAATCAGATTA"                            
##  [5] "CTAAAAGTGTTGCTTCGATTAATCTTTCTAGTTTTACTAGGATTTTGCTGCTATAGATTGTTAGTTATAT"                            
##  [6] "TAATGTAAGGCAACCCGATGTCTAAAACTGGTTTTTCCGAGGAATTACTGGTCATCGCGCTGTCTACTCT"                            
##  [7] "TGTACAGAATGGTAAGCACGTGTAATAGGAGGTACAAGCAACCCTATTGCATATTAGGAAGTTTAGATTT"                            
##  [8] "GATTTGGCAATGCTAGATTTAGTAATTTAGAGAAGTTTAAAGATCCGCTACGACGAGCCAACAATGGAAG"                            
##  [9] "AGCTAACGTCTGGATCTAGTGATTGTTTAAAATGTAAAATTGTTTGAAAATTTTCCTTTTGATAGTGATA"                            
## [10] "C"                                                                                                 
## [11] ""

File conversion

FASTA and PHYLIP conversion

# Example FASTA file installed with the seqmagick package.
fa_file <- system.file("extdata/HA.fas", package = "seqmagick")

# Work on a small subset so the vignette compiles quickly.
fa2 <- tempfile(fileext = ".fa")
fa_read(fa_file) %>%
  bs_filter("ATGAAAGTAAAA", by = "sequence") %>%
  fa_write(fa2, type = "interleaved")

# Pass the subset through bs_aln() and write the result to a new FASTA file.
alnfas <- tempfile(fileext = ".fas")
fa_read(fa2) %>%
  bs_aln(quiet = TRUE) %>%
  fa_write(alnfas)

# The PHYLIP format can only hold aligned sequences.
tmpphy <- tempfile(fileext = ".phy")
fas2phy(alnfas, tmpphy, type = "sequential")

seqmagick supports both sequential and interleaved formats; users can choose between them with the type parameter.

# Reverse direction: tmpphy is passed first and alnfas second, so the
# earlier alignment file is presumably overwritten -- confirm against the
# phy2fas() documentation.
phy2fas(tmpphy, alnfas, type = "interleaved")

interleaved and sequential format conversion

# Re-write the subset FASTA file with type = "sequential".
tmpfas <- tempfile(fileext = ".fa")
fa_read(fa2) %>%
  fa_write(tmpfas, type = "sequential")

# Re-write the PHYLIP file with type = "interleaved".
tmpphy2 <- tempfile(fileext = ".phy")
phy_read(tmpphy) %>%
  phy_write(tmpphy2, type = "interleaved")

Sequence manipulation

# Load the example sequences.
bs <- fa_read(fa_file)

# Filter with the pattern "ATGAAAGTAAAA", matching by sequence.
bs_filter(bs, "ATGAAAGTAAAA", by = "sequence")

# Apply the same filter, then pass the result through bs_aln().
aln <- bs %>%
  bs_filter("ATGAAAGTAAAA", by = "sequence") %>%
  bs_aln(quiet = TRUE)

# Compute the consensus from the result of bs_aln().
bs_consensus(aln)

Bugs/Feature requests

If you find a bug or have a feature request, please let me know. Thanks!

Session info

Here is the output of sessionInfo() on the system on which this document was compiled:

## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Arch Linux
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas.so.3.9.0
## LAPACK: /usr/lib/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
## [1] seqmagick_0.1.3     Biostrings_2.54.0   XVector_0.26.0     
## [4] IRanges_2.20.1      S4Vectors_0.24.1    BiocGenerics_0.32.0
## [7] magrittr_1.5       
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3      digest_0.6.23   evaluate_0.14   zlibbioc_1.32.0
##  [5] rlang_0.4.2     stringi_1.4.3   rmarkdown_2.0   tools_3.6.2    
##  [9] downloader_0.4  stringr_1.4.0   prettydoc_0.3.1 xfun_0.11      
## [13] yaml_2.2.0      compiler_3.6.2  htmltools_0.4.0 knitr_1.26

References