Working With Sequences

Eric Archer

2020-02-23

There are several functions for working with sequence data in strataG. They will either take a haploid gtypes object that contains sequences or some format that can be converted to a DNAbin (in the ape package), or multidna (in the apex package) object.

Leading and trailing N’s can be removed from all sequences like this:

library(ape)
data(dolph.seqs)

i <- sample(1:10, 1)
j <- sample(1:10, 1)
x <- c(rep("n", i), dolph.seqs[[1]], rep("n", j))
x

##   [1] "n" "n" "n" "n" "n" "n" "g" "a" "a" "a" "a" "a" "-" "g" "c" "t" "t" "a"
##  [19] "t" "t" "g" "t" "a" "c" "a" "g" "t" "t" "a" "c" "c" "a" "c" "a" "a" "c"
##  [37] "a" "t" "c" "a" "c" "a" "g" "t" "a" "c" "t" "a" "c" "g" "t" "c" "a" "g"
##  [55] "t" "a" "t" "t" "a" "a" "a" "a" "g" "t" "a" "a" "t" "t" "t" "g" "t" "t"
##  [73] "t" "t" "a" "a" "a" "a" "a" "c" "a" "t" "t" "t" "t" "a" "c" "t" "g" "t"
##  [91] "a" "c" "a" "c" "a" "t" "t" "a" "c" "a" "t" "a" "t" "a" "c" "a" "t" "a"
## [109] "c" "a" "c" "a" "t" "g" "t" "g" "c" "a" "t" "g" "c" "t" "a" "a" "t" "a"
## [127] "t" "t" "t" "a" "g" "t" "c" "-" "t" "c" "t" "c" "c" "t" "t" "g" "t" "a"
## [145] "a" "a" "t" "a" "t" "t" "c" "a" "t" "a" "c" "a" "t" "a" "c" "a" "t" "g"
## [163] "c" "t" "a" "t" "g" "t" "a" "t" "t" "a" "t" "t" "g" "t" "g" "c" "a" "t"
## [181] "t" "c" "a" "t" "t" "t" "a" "t" "t" "t" "t" "c" "c" "a" "t" "a" "c" "g"
## [199] "a" "t" "a" "a" "g" "t" "t" "a" "a" "a" "g" "c" "c" "c" "g" "t" "a" "t"
## [217] "t" "a" "a" "t" "t" "a" "t" "c" "a" "t" "t" "a" "a" "t" "t" "t" "t" "a"
## [235] "c" "a" "t" "a" "t" "t" "a" "c" "a" "t" "a" "a" "t" "a" "t" "g" "c" "a"
## [253] "t" "g" "c" "t" "c" "t" "t" "a" "c" "a" "t" "a" "t" "t" "a" "t" "a" "t"
## [271] "c" "t" "c" "c" "c" "c" "t" "a" "t" "c" "a" "a" "t" "t" "t" "c" "a" "c"
## [289] "c" "t" "c" "c" "a" "t" "t" "a" "t" "a" "c" "c" "c" "t" "a" "t" "g" "g"
## [307] "t" "c" "a" "c" "t" "c" "c" "a" "t" "t" "a" "g" "a" "t" "c" "a" "c" "g"
## [325] "a" "g" "c" "t" "t" "a" "a" "t" "c" "a" "c" "c" "a" "t" "g" "c" "c" "g"
## [343] "c" "g" "t" "g" "a" "a" "a" "c" "c" "a" "g" "c" "a" "a" "c" "c" "c" "g"
## [361] "c" "t" "c" "g" "g" "c" "a" "g" "g" "g" "a" "t" "c" "c" "c" "t" "c" "t"
## [379] "t" "c" "t" "c" "g" "c" "a" "c" "c" "g" "g" "g" "c" "c" "c" "a" "t" "a"
## [397] "t" "c" "t" "c" "g" "t" "g" "g" "g" "g" "g" "t" "n" "n" "n" "n" "n"

x.trimmed <- trimNs(as.DNAbin(x))
as.character(as.list(x.trimmed))

## [[1]]
##   [1] "g" "a" "a" "a" "a" "a" "-" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a" "c"
##  [19] "a" "g" "t" "t" "a" "c" "c" "a" "c" "a" "a" "c" "a" "t" "c" "a" "c" "a"
##  [37] "g" "t" "a" "c" "t" "a" "c" "g" "t" "c" "a" "g" "t" "a" "t" "t" "a" "a"
##  [55] "a" "a" "g" "t" "a" "a" "t" "t" "t" "g" "t" "t" "t" "t" "a" "a" "a" "a"
##  [73] "a" "c" "a" "t" "t" "t" "t" "a" "c" "t" "g" "t" "a" "c" "a" "c" "a" "t"
##  [91] "t" "a" "c" "a" "t" "a" "t" "a" "c" "a" "t" "a" "c" "a" "c" "a" "t" "g"
## [109] "t" "g" "c" "a" "t" "g" "c" "t" "a" "a" "t" "a" "t" "t" "t" "a" "g" "t"
## [127] "c" "-" "t" "c" "t" "c" "c" "t" "t" "g" "t" "a" "a" "a" "t" "a" "t" "t"
## [145] "c" "a" "t" "a" "c" "a" "t" "a" "c" "a" "t" "g" "c" "t" "a" "t" "g" "t"
## [163] "a" "t" "t" "a" "t" "t" "g" "t" "g" "c" "a" "t" "t" "c" "a" "t" "t" "t"
## [181] "a" "t" "t" "t" "t" "c" "c" "a" "t" "a" "c" "g" "a" "t" "a" "a" "g" "t"
## [199] "t" "a" "a" "a" "g" "c" "c" "c" "g" "t" "a" "t" "t" "a" "a" "t" "t" "a"
## [217] "t" "c" "a" "t" "t" "a" "a" "t" "t" "t" "t" "a" "c" "a" "t" "a" "t" "t"
## [235] "a" "c" "a" "t" "a" "a" "t" "a" "t" "g" "c" "a" "t" "g" "c" "t" "c" "t"
## [253] "t" "a" "c" "a" "t" "a" "t" "t" "a" "t" "a" "t" "c" "t" "c" "c" "c" "c"
## [271] "t" "a" "t" "c" "a" "a" "t" "t" "t" "c" "a" "c" "c" "t" "c" "c" "a" "t"
## [289] "t" "a" "t" "a" "c" "c" "c" "t" "a" "t" "g" "g" "t" "c" "a" "c" "t" "c"
## [307] "c" "a" "t" "t" "a" "g" "a" "t" "c" "a" "c" "g" "a" "g" "c" "t" "t" "a"
## [325] "a" "t" "c" "a" "c" "c" "a" "t" "g" "c" "c" "g" "c" "g" "t" "g" "a" "a"
## [343] "a" "c" "c" "a" "g" "c" "a" "a" "c" "c" "c" "g" "c" "t" "c" "g" "g" "c"
## [361] "a" "g" "g" "g" "a" "t" "c" "c" "c" "t" "c" "t" "t" "c" "t" "c" "g" "c"
## [379] "a" "c" "c" "g" "g" "g" "c" "c" "c" "a" "t" "a" "t" "c" "t" "c" "g" "t"
## [397] "g" "g" "g" "g" "g" "t"

Base frequencies for a sequence are calculated with the baseFreqs function:

bf <- baseFreqs(dolph.seqs)
bf$site.freqs[, 1:8]

##     1   2   3   4   5   6   7   8
## a   0 126 126 126 126 126   5   0
## c   0   0   0   0   0   0   0   0
## g 126   0   0   0   0   0   0 126
## t   0   0   0   0   0   0   0   0
## u   0   0   0   0   0   0   0   0
## r   0   0   0   0   0   0   0   0
## y   0   0   0   0   0   0   0   0
## m   0   0   0   0   0   0   0   0
## k   0   0   0   0   0   0   0   0
## w   0   0   0   0   0   0   0   0
## s   0   0   0   0   0   0   0   0
## b   0   0   0   0   0   0   0   0
## d   0   0   0   0   0   0   0   0
## h   0   0   0   0   0   0   0   0
## v   0   0   0   0   0   0   0   0
## n   0   0   0   0   0   0   0   0
## x   0   0   0   0   0   0   0   0
## -   0   0   0   0   0   0 121   0
## .   0   0   0   0   0   0   0   0

bf$base.freqs

## 
##     a     c     g     t     u     r     y     m     k     w     s     b     d 
## 15179 11561  6501 17166     0     0     0     0     0     0     0     0     0 
##     h     v     n     x     -     . 
##     0     0     0     0   245     0

One can also identify which sites are fixed and which are variable:

fs <- fixedSites(dolph.seqs)
fs[1:20]

##   1   2   3   4   5   6   8   9  10  11  12  13  14  15  16  17  18  19  21  22 
## "g" "a" "a" "a" "a" "a" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a" "c" "a" "t" "t"

vs <- variableSites(dolph.seqs)
vs

## $sites
## 126 DNA sequences in binary format stored in a matrix.
## 
## All sequences of same length: 43 
## 
## Labels:
## 4495
## 4496
## 4498
## 5814
## 5815
## 5816
## ...
## 
## Base composition:
##     a     c     g     t 
## 0.206 0.359 0.088 0.347 
## (Total: 5.4 kb)
## 
## $site.freqs
##     7  20  32  57  92  97  99 101 104 106 109 128 149 150 151 205 245 248 265
## a   5   2   0   1 124   0   0   0 125 124   0   0   0 124   0   0   0   2   2
## c   0   0  10   0   0   7 115   6   0   0  12   0 112   0   2 102 114   0  31
## g   0 124   0 125   2   0   0   0   1   2   0   0   0   2   0   0   0 124   0
## t   0   0 116   0   0 119  11 120   0   0 114   2  14   0 124  24  12   0  93
## - 121   0   0   0   0   0   0   0   0   0   0 124   0   0   0   0   0   0   0
##   269 272 274 275 278 279 280 282 283 287 293 294 302 303 305 329 357 370 373
## a   0 123   1 123   0   0   0   0   0 125   0   0   0  78   0   0   0   0   0
## c  99   0 124   0  10  14   2  98  97   0  84  97 124   0   4 112 124  77   4
## g   0   3   0   3   0   0   0   0   0   1   0   0   0  48   0   0   0   0   0
## t  27   0   1   0 116 112 124  28  29   0  42  29   2   0 122  14   2  49 122
## -   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   390 391 392 393 394
## a 108   0   0   0   0
## c   0  27 125   1 125
## g  18   0   0   0   0
## t   0  99   1 125   1
## -   0   0   0   0   0

Both functions take an optional set of bases to consider when evaluating whether a site is fixed or variable. For fixedSites, the function will only count those sites that are fixed in the listed bases argument. For variableSites the site is considered variable if it has those bases and is not fixed for them:

fs <- fixedSites(dolph.seqs, bases = c("c", "t"))
fs[1:20]

##   9  10  11  13  14  16  18  21  22  24  25  27  30  33  35  38  40  41  43  45 
## "c" "t" "t" "t" "t" "t" "c" "t" "t" "c" "c" "c" "c" "c" "c" "t" "c" "t" "c" "t"

vs <- variableSites(dolph.seqs, bases = c("c", "t"))
vs

## $sites
## 126 DNA sequences in binary format stored in a matrix.
## 
## All sequences of same length: 29 
## 
## Labels:
## 4495
## 4496
## 4498
## 5814
## 5815
## 5816
## ...
## 
## Base composition:
##     a     c     g     t 
## 0.001 0.509 0.000 0.490 
## (Total: 3.6 kb)
## 
## $site.freqs
##    32  97  99 101 109 149 151 205 245 265 269 274 278 279 280 282 283 293 294
## c  10   7 115   6  12 112   2 102 114  31  99 124  10  14   2  98  97  84  97
## t 116 119  11 120 114  14 124  24  12  93  27   1 116 112 124  28  29  42  29
##   302 305 329 357 370 373 391 392 393 394
## c 124   4 112 124  77   4  27 125   1 125
## t   2 122  14   2  49 122  99   1 125   1

There are also functions to compare bases against IUPAC ambiguity codes. One can calculate the appropriate IUPAC code for a vector of nucleotides:

iupacCode(c("c", "t", "t", "c", "c"))

## [1] "y"

iupacCode(c("c", "t", "a", "c", "c"))

## [1] "h"

iupacCode(c("g", "t", "a", "c", "c"))

## [1] "n"

One can also calculate all IUPAC codes that apply to a vector of nucleotides:

validIupacCodes(c("c", "t", "t", "c", "c"))

## [1] "y" "b" "h" "n" "x" "-" "."

validIupacCodes(c("c", "t", "a", "c", "c"))

## [1] "h" "n" "x" "-" "."

validIupacCodes(c("g", "t", "a", "c", "c"))

## [1] "n" "x" "-" "."

A consensus sequence can also be easily generated:

createConsensus(dolph.seqs)

##   [1] "g" "a" "a" "a" "a" "a" "-" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a" "c"
##  [19] "a" "r" "t" "t" "a" "c" "c" "a" "c" "a" "a" "c" "a" "y" "c" "a" "c" "a"
##  [37] "g" "t" "a" "c" "t" "a" "c" "g" "t" "c" "a" "g" "t" "a" "t" "t" "a" "a"
##  [55] "a" "a" "r" "t" "a" "a" "t" "t" "t" "g" "t" "t" "t" "t" "a" "a" "a" "a"
##  [73] "a" "c" "a" "t" "t" "t" "t" "a" "c" "t" "g" "t" "a" "c" "a" "c" "a" "t"
##  [91] "t" "r" "c" "a" "t" "a" "y" "a" "y" "a" "y" "a" "c" "r" "c" "r" "t" "g"
## [109] "y" "g" "c" "a" "t" "g" "c" "t" "a" "a" "t" "a" "t" "t" "t" "a" "g" "t"
## [127] "c" "-" "t" "c" "t" "c" "c" "t" "t" "g" "t" "a" "a" "a" "t" "a" "t" "t"
## [145] "c" "a" "t" "a" "y" "r" "y" "a" "c" "a" "t" "g" "c" "t" "a" "t" "g" "t"
## [163] "a" "t" "t" "a" "t" "t" "g" "t" "g" "c" "a" "t" "t" "c" "a" "t" "t" "t"
## [181] "a" "t" "t" "t" "t" "c" "c" "a" "t" "a" "c" "g" "a" "t" "a" "a" "g" "t"
## [199] "t" "a" "a" "a" "g" "c" "y" "c" "g" "t" "a" "t" "t" "a" "a" "t" "t" "a"
## [217] "t" "c" "a" "t" "t" "a" "a" "t" "t" "t" "t" "a" "c" "a" "t" "a" "t" "t"
## [235] "a" "c" "a" "t" "a" "a" "t" "a" "t" "g" "y" "a" "t" "r" "c" "t" "c" "t"
## [253] "t" "a" "c" "a" "t" "a" "t" "t" "a" "t" "a" "t" "h" "t" "c" "c" "y" "c"
## [271] "t" "r" "t" "h" "r" "a" "t" "y" "y" "y" "a" "y" "y" "t" "c" "c" "r" "t"
## [289] "t" "a" "t" "a" "y" "y" "c" "t" "a" "t" "g" "g" "t" "y" "r" "c" "y" "c"
## [307] "c" "a" "t" "t" "a" "g" "a" "t" "c" "a" "c" "g" "a" "g" "c" "t" "t" "a"
## [325] "a" "t" "c" "a" "y" "c" "a" "t" "g" "c" "c" "g" "c" "g" "t" "g" "a" "a"
## [343] "a" "c" "c" "a" "g" "c" "a" "a" "c" "c" "c" "g" "c" "t" "y" "g" "g" "c"
## [361] "a" "g" "g" "g" "a" "t" "c" "c" "c" "y" "c" "t" "y" "c" "t" "c" "g" "c"
## [379] "a" "c" "c" "g" "g" "g" "c" "c" "c" "a" "t" "r" "y" "y" "y" "y" "g" "t"
## [397] "g" "g" "g" "g" "g" "t"

Nucleotide diversity for each site is calculaed with:

nd <- nucleotideDiversity(dolph.seqs)
head(nd)

## 1 2 3 4 5 6 
## 0 0 0 0 0 0

For a stratified gtypes object, one can calculate net nucleotide divergence (Nei’s dA), and distributions of between- and within-strata divergence:

# create gtypes
data(dolph.seqs)
data(dolph.strata)
rownames(dolph.strata) <- dolph.strata$id
dloop <- df2gtypes(dolph.strata[, c("id", "fine", "id")], ploidy = 1,
             schemes = dolph.strata[, c("fine", "broad")], sequences = dolph.seqs)
dloop <- labelHaplotypes(dloop, "Hap.")

# calculate divergence
nucleotideDivergence(dloop)

## $within
##   locus        stratum   mean q.0 q.0.025 q.0.5 q.0.975   q.1
## 1  id.1        Coastal 0.0051   0       0 0.005   0.010 0.010
## 2  id.1 Offshore.North 0.0227   0       0 0.022   0.037 0.050
## 3  id.1 Offshore.South 0.0187   0       0 0.018   0.038 0.043
## 
## $between
##   locus       strata.1       strata.2     dA  mean    q.0 q.0.025 q.0.5 q.0.975
## 1  id.1        Coastal Offshore.North 0.0061 0.020 0.0000  0.0050 0.020   0.033
## 2  id.1        Coastal Offshore.South 0.0062 0.018 0.0075  0.0075 0.018   0.033
## 3  id.1 Offshore.North Offshore.South 0.0011 0.022 0.0000  0.0000 0.022   0.040
##     q.1
## 1 0.037
## 2 0.037
## 3 0.048

For stratified gtypes, one can also identify fixed differences between strata:

fixedDifferences(dloop)

## $sites
## $sites$`Coastal v. Offshore.North`
##               
## Coastal       
## Offshore.North
## 
## $sites$`Coastal v. Offshore.South`
##               
## Coastal       
## Offshore.South
## 
## $sites$`Offshore.North v. Offshore.South`
##               
## Offshore.North
## Offshore.South
## 
## 
## $num.fixed
##         strata.1       strata.2 num.fixed
## 1        Coastal Offshore.North         0
## 2        Coastal Offshore.South         0
## 3 Offshore.North Offshore.South         0

Two functions have been provided to select a subset of representative sequences. The first selects the most distant sequences in order to capture the full distribution of variation. For example:

x <- as.DNAbin(dolph.seqs)
mostDistantSequences(x, num.seqs = 5)

## [1] "74962" "6290"  "4498"  "18652" "5815"

The other function selects the most representative sequences by first clustering the sequences and selecting the sequences closest to the center of each cluster:

mostRepresentativeSequences(x, num.seqs = 5)

## [1] "18655" "26305" "26316" "41578" "78044"