There are several functions for working with sequence data in strataG. They will either take a haploid gtypes object that contains sequences or some format that can be converted to a DNAbin (in the ape package), or multidna (in the apex package) object.
Leading and trailing N’s can be removed from all sequences like this:
library(ape)
data(dolph.seqs)
i <- sample(1:10, 1)
j <- sample(1:10, 1)
x <- c(rep("n", i), dolph.seqs[[1]], rep("n", j))
x
## [1] "n" "n" "n" "n" "n" "n" "g" "a" "a" "a" "a" "a" "-" "g" "c" "t" "t" "a"
## [19] "t" "t" "g" "t" "a" "c" "a" "g" "t" "t" "a" "c" "c" "a" "c" "a" "a" "c"
## [37] "a" "t" "c" "a" "c" "a" "g" "t" "a" "c" "t" "a" "c" "g" "t" "c" "a" "g"
## [55] "t" "a" "t" "t" "a" "a" "a" "a" "g" "t" "a" "a" "t" "t" "t" "g" "t" "t"
## [73] "t" "t" "a" "a" "a" "a" "a" "c" "a" "t" "t" "t" "t" "a" "c" "t" "g" "t"
## [91] "a" "c" "a" "c" "a" "t" "t" "a" "c" "a" "t" "a" "t" "a" "c" "a" "t" "a"
## [109] "c" "a" "c" "a" "t" "g" "t" "g" "c" "a" "t" "g" "c" "t" "a" "a" "t" "a"
## [127] "t" "t" "t" "a" "g" "t" "c" "-" "t" "c" "t" "c" "c" "t" "t" "g" "t" "a"
## [145] "a" "a" "t" "a" "t" "t" "c" "a" "t" "a" "c" "a" "t" "a" "c" "a" "t" "g"
## [163] "c" "t" "a" "t" "g" "t" "a" "t" "t" "a" "t" "t" "g" "t" "g" "c" "a" "t"
## [181] "t" "c" "a" "t" "t" "t" "a" "t" "t" "t" "t" "c" "c" "a" "t" "a" "c" "g"
## [199] "a" "t" "a" "a" "g" "t" "t" "a" "a" "a" "g" "c" "c" "c" "g" "t" "a" "t"
## [217] "t" "a" "a" "t" "t" "a" "t" "c" "a" "t" "t" "a" "a" "t" "t" "t" "t" "a"
## [235] "c" "a" "t" "a" "t" "t" "a" "c" "a" "t" "a" "a" "t" "a" "t" "g" "c" "a"
## [253] "t" "g" "c" "t" "c" "t" "t" "a" "c" "a" "t" "a" "t" "t" "a" "t" "a" "t"
## [271] "c" "t" "c" "c" "c" "c" "t" "a" "t" "c" "a" "a" "t" "t" "t" "c" "a" "c"
## [289] "c" "t" "c" "c" "a" "t" "t" "a" "t" "a" "c" "c" "c" "t" "a" "t" "g" "g"
## [307] "t" "c" "a" "c" "t" "c" "c" "a" "t" "t" "a" "g" "a" "t" "c" "a" "c" "g"
## [325] "a" "g" "c" "t" "t" "a" "a" "t" "c" "a" "c" "c" "a" "t" "g" "c" "c" "g"
## [343] "c" "g" "t" "g" "a" "a" "a" "c" "c" "a" "g" "c" "a" "a" "c" "c" "c" "g"
## [361] "c" "t" "c" "g" "g" "c" "a" "g" "g" "g" "a" "t" "c" "c" "c" "t" "c" "t"
## [379] "t" "c" "t" "c" "g" "c" "a" "c" "c" "g" "g" "g" "c" "c" "c" "a" "t" "a"
## [397] "t" "c" "t" "c" "g" "t" "g" "g" "g" "g" "g" "t" "n" "n" "n" "n" "n"
## [[1]]
## [1] "g" "a" "a" "a" "a" "a" "-" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a" "c"
## [19] "a" "g" "t" "t" "a" "c" "c" "a" "c" "a" "a" "c" "a" "t" "c" "a" "c" "a"
## [37] "g" "t" "a" "c" "t" "a" "c" "g" "t" "c" "a" "g" "t" "a" "t" "t" "a" "a"
## [55] "a" "a" "g" "t" "a" "a" "t" "t" "t" "g" "t" "t" "t" "t" "a" "a" "a" "a"
## [73] "a" "c" "a" "t" "t" "t" "t" "a" "c" "t" "g" "t" "a" "c" "a" "c" "a" "t"
## [91] "t" "a" "c" "a" "t" "a" "t" "a" "c" "a" "t" "a" "c" "a" "c" "a" "t" "g"
## [109] "t" "g" "c" "a" "t" "g" "c" "t" "a" "a" "t" "a" "t" "t" "t" "a" "g" "t"
## [127] "c" "-" "t" "c" "t" "c" "c" "t" "t" "g" "t" "a" "a" "a" "t" "a" "t" "t"
## [145] "c" "a" "t" "a" "c" "a" "t" "a" "c" "a" "t" "g" "c" "t" "a" "t" "g" "t"
## [163] "a" "t" "t" "a" "t" "t" "g" "t" "g" "c" "a" "t" "t" "c" "a" "t" "t" "t"
## [181] "a" "t" "t" "t" "t" "c" "c" "a" "t" "a" "c" "g" "a" "t" "a" "a" "g" "t"
## [199] "t" "a" "a" "a" "g" "c" "c" "c" "g" "t" "a" "t" "t" "a" "a" "t" "t" "a"
## [217] "t" "c" "a" "t" "t" "a" "a" "t" "t" "t" "t" "a" "c" "a" "t" "a" "t" "t"
## [235] "a" "c" "a" "t" "a" "a" "t" "a" "t" "g" "c" "a" "t" "g" "c" "t" "c" "t"
## [253] "t" "a" "c" "a" "t" "a" "t" "t" "a" "t" "a" "t" "c" "t" "c" "c" "c" "c"
## [271] "t" "a" "t" "c" "a" "a" "t" "t" "t" "c" "a" "c" "c" "t" "c" "c" "a" "t"
## [289] "t" "a" "t" "a" "c" "c" "c" "t" "a" "t" "g" "g" "t" "c" "a" "c" "t" "c"
## [307] "c" "a" "t" "t" "a" "g" "a" "t" "c" "a" "c" "g" "a" "g" "c" "t" "t" "a"
## [325] "a" "t" "c" "a" "c" "c" "a" "t" "g" "c" "c" "g" "c" "g" "t" "g" "a" "a"
## [343] "a" "c" "c" "a" "g" "c" "a" "a" "c" "c" "c" "g" "c" "t" "c" "g" "g" "c"
## [361] "a" "g" "g" "g" "a" "t" "c" "c" "c" "t" "c" "t" "t" "c" "t" "c" "g" "c"
## [379] "a" "c" "c" "g" "g" "g" "c" "c" "c" "a" "t" "a" "t" "c" "t" "c" "g" "t"
## [397] "g" "g" "g" "g" "g" "t"
Base frequencies for a sequence are calculated with the baseFreqs function:
## 1 2 3 4 5 6 7 8
## a 0 126 126 126 126 126 5 0
## c 0 0 0 0 0 0 0 0
## g 126 0 0 0 0 0 0 126
## t 0 0 0 0 0 0 0 0
## u 0 0 0 0 0 0 0 0
## r 0 0 0 0 0 0 0 0
## y 0 0 0 0 0 0 0 0
## m 0 0 0 0 0 0 0 0
## k 0 0 0 0 0 0 0 0
## w 0 0 0 0 0 0 0 0
## s 0 0 0 0 0 0 0 0
## b 0 0 0 0 0 0 0 0
## d 0 0 0 0 0 0 0 0
## h 0 0 0 0 0 0 0 0
## v 0 0 0 0 0 0 0 0
## n 0 0 0 0 0 0 0 0
## x 0 0 0 0 0 0 0 0
## - 0 0 0 0 0 0 121 0
## . 0 0 0 0 0 0 0 0
##
## a c g t u r y m k w s b d
## 15179 11561 6501 17166 0 0 0 0 0 0 0 0 0
## h v n x - .
## 0 0 0 0 245 0
One can also identify which sites are fixed and which are variable:
## 1 2 3 4 5 6 8 9 10 11 12 13 14 15 16 17 18 19 21 22
## "g" "a" "a" "a" "a" "a" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a" "c" "a" "t" "t"
## $sites
## 126 DNA sequences in binary format stored in a matrix.
##
## All sequences of same length: 43
##
## Labels:
## 4495
## 4496
## 4498
## 5814
## 5815
## 5816
## ...
##
## Base composition:
## a c g t
## 0.206 0.359 0.088 0.347
## (Total: 5.4 kb)
##
## $site.freqs
## 7 20 32 57 92 97 99 101 104 106 109 128 149 150 151 205 245 248 265
## a 5 2 0 1 124 0 0 0 125 124 0 0 0 124 0 0 0 2 2
## c 0 0 10 0 0 7 115 6 0 0 12 0 112 0 2 102 114 0 31
## g 0 124 0 125 2 0 0 0 1 2 0 0 0 2 0 0 0 124 0
## t 0 0 116 0 0 119 11 120 0 0 114 2 14 0 124 24 12 0 93
## - 121 0 0 0 0 0 0 0 0 0 0 124 0 0 0 0 0 0 0
## 269 272 274 275 278 279 280 282 283 287 293 294 302 303 305 329 357 370 373
## a 0 123 1 123 0 0 0 0 0 125 0 0 0 78 0 0 0 0 0
## c 99 0 124 0 10 14 2 98 97 0 84 97 124 0 4 112 124 77 4
## g 0 3 0 3 0 0 0 0 0 1 0 0 0 48 0 0 0 0 0
## t 27 0 1 0 116 112 124 28 29 0 42 29 2 0 122 14 2 49 122
## - 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 390 391 392 393 394
## a 108 0 0 0 0
## c 0 27 125 1 125
## g 18 0 0 0 0
## t 0 99 1 125 1
## - 0 0 0 0 0
Both functions take an optional set of bases to consider when evaluating whether a site is fixed or variable. For fixedSites, the function will only count those sites that are fixed in the listed bases argument. For variableSites the site is considered variable if it has those bases and is not fixed for them:
## 9 10 11 13 14 16 18 21 22 24 25 27 30 33 35 38 40 41 43 45
## "c" "t" "t" "t" "t" "t" "c" "t" "t" "c" "c" "c" "c" "c" "c" "t" "c" "t" "c" "t"
## $sites
## 126 DNA sequences in binary format stored in a matrix.
##
## All sequences of same length: 29
##
## Labels:
## 4495
## 4496
## 4498
## 5814
## 5815
## 5816
## ...
##
## Base composition:
## a c g t
## 0.001 0.509 0.000 0.490
## (Total: 3.6 kb)
##
## $site.freqs
## 32 97 99 101 109 149 151 205 245 265 269 274 278 279 280 282 283 293 294
## c 10 7 115 6 12 112 2 102 114 31 99 124 10 14 2 98 97 84 97
## t 116 119 11 120 114 14 124 24 12 93 27 1 116 112 124 28 29 42 29
## 302 305 329 357 370 373 391 392 393 394
## c 124 4 112 124 77 4 27 125 1 125
## t 2 122 14 2 49 122 99 1 125 1
There are also functions to compare bases against IUPAC ambiguity codes. One can calculate the appropriate IUPAC code for a vector of nucleotides:
## [1] "y"
## [1] "h"
## [1] "n"
One can also calculate all IUPAC codes that apply to a vector of nucleotides:
## [1] "y" "b" "h" "n" "x" "-" "."
## [1] "h" "n" "x" "-" "."
## [1] "n" "x" "-" "."
A consensus sequence can also be easily generated:
## [1] "g" "a" "a" "a" "a" "a" "-" "g" "c" "t" "t" "a" "t" "t" "g" "t" "a" "c"
## [19] "a" "r" "t" "t" "a" "c" "c" "a" "c" "a" "a" "c" "a" "y" "c" "a" "c" "a"
## [37] "g" "t" "a" "c" "t" "a" "c" "g" "t" "c" "a" "g" "t" "a" "t" "t" "a" "a"
## [55] "a" "a" "r" "t" "a" "a" "t" "t" "t" "g" "t" "t" "t" "t" "a" "a" "a" "a"
## [73] "a" "c" "a" "t" "t" "t" "t" "a" "c" "t" "g" "t" "a" "c" "a" "c" "a" "t"
## [91] "t" "r" "c" "a" "t" "a" "y" "a" "y" "a" "y" "a" "c" "r" "c" "r" "t" "g"
## [109] "y" "g" "c" "a" "t" "g" "c" "t" "a" "a" "t" "a" "t" "t" "t" "a" "g" "t"
## [127] "c" "-" "t" "c" "t" "c" "c" "t" "t" "g" "t" "a" "a" "a" "t" "a" "t" "t"
## [145] "c" "a" "t" "a" "y" "r" "y" "a" "c" "a" "t" "g" "c" "t" "a" "t" "g" "t"
## [163] "a" "t" "t" "a" "t" "t" "g" "t" "g" "c" "a" "t" "t" "c" "a" "t" "t" "t"
## [181] "a" "t" "t" "t" "t" "c" "c" "a" "t" "a" "c" "g" "a" "t" "a" "a" "g" "t"
## [199] "t" "a" "a" "a" "g" "c" "y" "c" "g" "t" "a" "t" "t" "a" "a" "t" "t" "a"
## [217] "t" "c" "a" "t" "t" "a" "a" "t" "t" "t" "t" "a" "c" "a" "t" "a" "t" "t"
## [235] "a" "c" "a" "t" "a" "a" "t" "a" "t" "g" "y" "a" "t" "r" "c" "t" "c" "t"
## [253] "t" "a" "c" "a" "t" "a" "t" "t" "a" "t" "a" "t" "h" "t" "c" "c" "y" "c"
## [271] "t" "r" "t" "h" "r" "a" "t" "y" "y" "y" "a" "y" "y" "t" "c" "c" "r" "t"
## [289] "t" "a" "t" "a" "y" "y" "c" "t" "a" "t" "g" "g" "t" "y" "r" "c" "y" "c"
## [307] "c" "a" "t" "t" "a" "g" "a" "t" "c" "a" "c" "g" "a" "g" "c" "t" "t" "a"
## [325] "a" "t" "c" "a" "y" "c" "a" "t" "g" "c" "c" "g" "c" "g" "t" "g" "a" "a"
## [343] "a" "c" "c" "a" "g" "c" "a" "a" "c" "c" "c" "g" "c" "t" "y" "g" "g" "c"
## [361] "a" "g" "g" "g" "a" "t" "c" "c" "c" "y" "c" "t" "y" "c" "t" "c" "g" "c"
## [379] "a" "c" "c" "g" "g" "g" "c" "c" "c" "a" "t" "r" "y" "y" "y" "y" "g" "t"
## [397] "g" "g" "g" "g" "g" "t"
Nucleotide diversity for each site is calculaed with:
## 1 2 3 4 5 6
## 0 0 0 0 0 0
For a stratified gtypes object, one can calculate net nucleotide divergence (Nei’s dA), and distributions of between- and within-strata divergence:
# create gtypes
data(dolph.seqs)
data(dolph.strata)
rownames(dolph.strata) <- dolph.strata$id
dloop <- df2gtypes(dolph.strata[, c("id", "fine", "id")], ploidy = 1,
schemes = dolph.strata[, c("fine", "broad")], sequences = dolph.seqs)
dloop <- labelHaplotypes(dloop, "Hap.")
# calculate divergence
nucleotideDivergence(dloop)
## $within
## locus stratum mean q.0 q.0.025 q.0.5 q.0.975 q.1
## 1 id.1 Coastal 0.0051 0 0 0.005 0.010 0.010
## 2 id.1 Offshore.North 0.0227 0 0 0.022 0.037 0.050
## 3 id.1 Offshore.South 0.0187 0 0 0.018 0.038 0.043
##
## $between
## locus strata.1 strata.2 dA mean q.0 q.0.025 q.0.5 q.0.975
## 1 id.1 Coastal Offshore.North 0.0061 0.020 0.0000 0.0050 0.020 0.033
## 2 id.1 Coastal Offshore.South 0.0062 0.018 0.0075 0.0075 0.018 0.033
## 3 id.1 Offshore.North Offshore.South 0.0011 0.022 0.0000 0.0000 0.022 0.040
## q.1
## 1 0.037
## 2 0.037
## 3 0.048
For stratified gtypes, one can also identify fixed differences between strata:
## $sites
## $sites$`Coastal v. Offshore.North`
##
## Coastal
## Offshore.North
##
## $sites$`Coastal v. Offshore.South`
##
## Coastal
## Offshore.South
##
## $sites$`Offshore.North v. Offshore.South`
##
## Offshore.North
## Offshore.South
##
##
## $num.fixed
## strata.1 strata.2 num.fixed
## 1 Coastal Offshore.North 0
## 2 Coastal Offshore.South 0
## 3 Offshore.North Offshore.South 0
Two functions have been provided to select a subset of representative sequences. The first selects the most distant sequences in order to capture the full distribution of variation. For example:
## [1] "74962" "6290" "4498" "18652" "5815"
The other function selects the most representative sequences by first clustering the sequences and selecting the sequences closest to the center of each cluster:
## [1] "18655" "26305" "26316" "41578" "78044"