library("quanteda.textmodels")
library("quanteda")
## Package version: 2.0.0.9000
## Parallel computing: 2 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:utils':
##
## View
quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
Here, we compare performance for the two models, and then to the performance from two other packages for fitting these models.
For these tests, we will use the dataset of 50,000 movie reviews from Maas et al. (2011). We will use their partition into test and training sets for training and fitting our models.
# large movie review database of 50,000 movie reviews
# Loading this .rda places the corpus object `data_corpus_LMRD` in the session.
load(url("https://www.dropbox.com/s/sjdfmx8ggwfda5o/data_corpus_LMRD.rda?dl=1"))
# Build a document-feature matrix, then split it on the docvar `set`,
# which records the original train/test partition from Maas et al.
dfmat <- dfm(data_corpus_LMRD)
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
Comparing the performance of fitting the model:
library("microbenchmark")
# Time model fitting only (no prediction): multinomial vs. Bernoulli
# Naive Bayes, 50 repetitions each; timings are reported below.
microbenchmark(
multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
times = 50
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 87.89121 92.07464 106.4365 94.40395 102.896 318.8454 50
## bern 98.64426 103.42245 123.7836 112.32778 147.595 177.6993 50
And for prediction:
# Time the combined fit + predict cycle for both distributions; the fit is
# repeated inside each expression so the two branches stay self-contained.
microbenchmark(
multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
newdata = dfmat_test),
bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
newdata = dfmat_test),
times = 50
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 109.6197 114.9879 131.4237 118.6746 127.0515 241.4961 50
## bern 160.2917 169.6258 202.2621 200.4712 221.1150 467.9268 50
Now let’s see how textmodel_nb()
compares to equivalent functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")
## naivebayes 0.9.7 loaded
# Benchmark the full fit + predict cycle for the multinomial model against
# fastNaiveBayes and naivebayes. All three use Laplace smoothing of 1 and
# the same training/test documents so the comparison is like-for-like.
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    # the other two packages require a plain Matrix::dgCMatrix input
    tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  times = 50
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 108.7307 114.5717 135.7260 121.6727 151.5379 263.4989 50
## fastNaiveBayes 237.7174 269.9217 308.8950 294.1616 334.4319 468.6609 50
## naivebayes 185.0392 191.5954 234.1745 203.1722 248.6193 405.7980 50
And Bernoulli. Note here that while we are supplying the boolean matrix to textmodel_nb(), this re-weighting from the count matrix would have been performed automatically within the function had we not done so in advance — it's done here just for comparison.
# Recode counts to presence/absence, as the Bernoulli model requires.
# textmodel_nb() would do this weighting itself; it is pre-computed here so
# that all three packages receive identical boolean input matrices.
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")
# Benchmark fit + predict for the Bernoulli model across the three packages,
# again with Laplace smoothing of 1 throughout.
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
    # predict on the boolean test matrix, matching the other two packages
    # (predictions are unchanged: the function re-weights counts itself)
    pred <- predict(tmod, newdata = dfmat_test_bern)
  },
  fastNaiveBayes = {
    tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  naivebayes = {
    tmod <- bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  times = 50
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 158.7135 170.4623 198.4379 196.7359 215.5290 387.4112 50
## fastNaiveBayes 265.9222 282.0496 309.9770 304.4984 329.3598 500.0242 50
## naivebayes 205.5777 212.8711 229.3411 218.5097 245.9122 290.4660 50
😎
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.0. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-02-23.