Cookbook - Using more complex recipes involving text

Emil Hvitfeldt

2020-07-08

Converting textual data into numerical features can be done in many different ways. The steps included in textrecipes should hopefully give you the flexibility to perform most of your desired text preprocessing tasks. This vignette will showcase examples that combine multiple steps.

This vignette will not do any modeling with the processed text, as its purpose is to showcase the flexibility and modularity of the steps. Therefore the only packages needed will be dplyr, recipes, textrecipes and modeldata. Examples will be performed on the okc_text data set, which is packaged with modeldata.

# Attach the packages used by the examples in this vignette.
library(dplyr)
library(recipes)
library(textrecipes)
library(modeldata)
# Load the okc_text example data set used by every recipe below.
data("okc_text")

Counting select words

Sometimes it is enough to know the counts of a handful of specific words. This can easily be achieved by using the arguments custom_stopword_source and keep = TRUE in step_stopwords.

# The only words whose counts we are interested in.
words <- c("you", "i", "sad", "happy")

# Tokenize essay0, keep only the tokens listed in `words` (keep = TRUE
# retains the listed words instead of removing them), then compute the
# term frequency of each remaining token.
okc_rec <- recipe(~ ., data = okc_text) %>%
  step_tokenize(essay0) %>%
  step_stopwords(
    essay0,
    custom_stopword_source = words,
    keep = TRUE
  ) %>%
  step_tf(essay0)

# Estimate the recipe.
okc_obj <- prep(okc_rec)

# Apply the prepped recipe and show only the generated term-frequency columns.
okc_obj %>%
  bake(new_data = okc_text) %>%
  select(starts_with("tf_essay0"))
#> # A tibble: 750 x 4
#>    tf_essay0_happy tf_essay0_i tf_essay0_sad tf_essay0_you
#>              <dbl>       <dbl>         <dbl>         <dbl>
#>  1               0           1             0             3
#>  2               0           1             0             0
#>  3               0          21             0             1
#>  4               1           5             0             0
#>  5               0           3             0             3
#>  6               0           8             0             0
#>  7               0          15             0             5
#>  8               0           7             0             0
#>  9               0           0             0             0
#> 10               0          14             0             1
#> # … with 740 more rows

Removing words in addition to the stop words list

You might know of certain words you don’t want included that aren’t part of your stop word list of choice. This can easily be done by applying the step_stopwords step twice, once for the stop words and once for your special words.

# A small stop word list used for this example.
stopwords_list <- c(
  "was", "she's", "who", "had", "some",
  "same", "you", "most", "it's", "they",
  "for", "i'll", "which", "shan't", "we're",
  "such", "more", "with", "there's", "each"
)

# Extra words to drop on top of the stop word list.
words <- c("sad", "happy")

# Tokenize essay0, remove the stop words, remove the extra words in a
# second step_stopwords() pass, then compute TF-IDF on what is left.
okc_rec <- recipe(~ ., data = okc_text) %>%
  step_tokenize(essay0) %>%
  step_stopwords(essay0, custom_stopword_source = stopwords_list) %>%
  step_stopwords(essay0, custom_stopword_source = words) %>%
  step_tfidf(essay0)

# Estimate the recipe.
okc_obj <- prep(okc_rec)

# Apply the prepped recipe and show only the generated TF-IDF columns.
okc_obj %>%
  bake(new_data = okc_text) %>%
  select(starts_with("tfidf_essay0"))
#> # A tibble: 750 x 9,235
#>    tfidf_essay0_0 tfidf_essay0_01 tfidf_essay0_0a… tfidf_essay0_0a…
#>             <dbl>           <dbl>            <dbl>            <dbl>
#>  1              0               0                0                0
#>  2              0               0                0                0
#>  3              0               0                0                0
#>  4              0               0                0                0
#>  5              0               0                0                0
#>  6              0               0                0                0
#>  7              0               0                0                0
#>  8              0               0                0                0
#>  9              0               0                0                0
#> 10              0               0                0                0
#> # … with 740 more rows, and 9,231 more variables:
#> #   tfidf_essay0_0aboondocks <dbl>, tfidf_essay0_0abrothers <dbl>,
#> #   tfidf_essay0_0aconfidential <dbl>, tfidf_essay0_0aconversation <dbl>,
#> #   tfidf_essay0_0adebates <dbl>, tfidf_essay0_0afly <dbl>,
#> #   tfidf_essay0_0afriends <dbl>, tfidf_essay0_0agiants <dbl>,
#> #   tfidf_essay0_0ahop <dbl>, tfidf_essay0_0ahunters <dbl>,
#> #   tfidf_essay0_0aking <dbl>, tfidf_essay0_0amovies <dbl>,
#> #   tfidf_essay0_0amusic <dbl>, tfidf_essay0_0aparties <dbl>,
#> #   tfidf_essay0_0arailroading <dbl>, tfidf_essay0_0ashows <dbl>,
#> #   tfidf_essay0_0atrips <dbl>, tfidf_essay0_0aweapons <dbl>,
#> #   tfidf_essay0_1 <dbl>, tfidf_essay0_10 <dbl>, `tfidf_essay0_10,000` <dbl>,
#> #   tfidf_essay0_100 <dbl>, tfidf_essay0_1000 <dbl>, tfidf_essay0_105 <dbl>,
#> #   tfidf_essay0_11 <dbl>, tfidf_essay0_110 <dbl>, tfidf_essay0_1193 <dbl>,
#> #   tfidf_essay0_12 <dbl>, tfidf_essay0_125 <dbl>, tfidf_essay0_12s <dbl>,
#> #   tfidf_essay0_13 <dbl>, tfidf_essay0_1337 <dbl>, tfidf_essay0_14 <dbl>,
#> #   tfidf_essay0_1400 <dbl>, tfidf_essay0_15 <dbl>, tfidf_essay0_150 <dbl>,
#> #   tfidf_essay0_16 <dbl>, tfidf_essay0_16th <dbl>, tfidf_essay0_17 <dbl>,
#> #   tfidf_essay0_18 <dbl>, tfidf_essay0_180 <dbl>,
#> #   tfidf_essay0_1886866717 <dbl>, tfidf_essay0_19 <dbl>,
#> #   tfidf_essay0_1904 <dbl>, tfidf_essay0_1964 <dbl>, tfidf_essay0_1966 <dbl>,
#> #   tfidf_essay0_1982 <dbl>, tfidf_essay0_1988 <dbl>, tfidf_essay0_1991 <dbl>,
#> #   tfidf_essay0_1992 <dbl>, tfidf_essay0_1996 <dbl>, tfidf_essay0_1998 <dbl>,
#> #   tfidf_essay0_1st <dbl>, tfidf_essay0_2 <dbl>, tfidf_essay0_20 <dbl>,
#> #   tfidf_essay0_200 <dbl>, tfidf_essay0_2000s <dbl>, tfidf_essay0_2001 <dbl>,
#> #   tfidf_essay0_2005 <dbl>, tfidf_essay0_2007 <dbl>, tfidf_essay0_2008 <dbl>,
#> #   tfidf_essay0_2009 <dbl>, tfidf_essay0_2010 <dbl>, tfidf_essay0_2011 <dbl>,
#> #   tfidf_essay0_2012 <dbl>, tfidf_essay0_202 <dbl>, tfidf_essay0_2021 <dbl>,
#> #   tfidf_essay0_20s <dbl>, tfidf_essay0_20snot <dbl>, tfidf_essay0_20th <dbl>,
#> #   tfidf_essay0_21 <dbl>, tfidf_essay0_22 <dbl>, tfidf_essay0_23 <dbl>,
#> #   tfidf_essay0_23yo <dbl>, tfidf_essay0_24 <dbl>, tfidf_essay0_245lb <dbl>,
#> #   tfidf_essay0_25 <dbl>, tfidf_essay0_250 <dbl>, tfidf_essay0_26 <dbl>,
#> #   tfidf_essay0_27 <dbl>, tfidf_essay0_27ish <dbl>, tfidf_essay0_27s <dbl>,
#> #   tfidf_essay0_28 <dbl>, tfidf_essay0_28th <dbl>, tfidf_essay0_29 <dbl>,
#> #   tfidf_essay0_2cedd <dbl>, tfidf_essay0_2cehksxolna <dbl>,
#> #   tfidf_essay0_2fbowling <dbl>, tfidf_essay0_2fdarts <dbl>,
#> #   tfidf_essay0_2fodd <dbl>, tfidf_essay0_2foutdoor <dbl>,
#> #   tfidf_essay0_2nd <dbl>, tfidf_essay0_2wqv9 <dbl>, tfidf_essay0_3 <dbl>,
#> #   `tfidf_essay0_3,329` <dbl>, tfidf_essay0_30 <dbl>, tfidf_essay0_300 <dbl>,
#> #   tfidf_essay0_30ish <dbl>, tfidf_essay0_30s <dbl>, tfidf_essay0_31 <dbl>, …

Letter distributions

Another thing one might want to look at is the use of different letters in a certain text. For this we can use the built-in character tokenizer and then keep only the lowercase letters by passing letters as the custom stop word source, together with keep = TRUE, in the step_stopwords step.

# Tokenize essay0 into single characters, keep only the tokens found in
# base R's `letters` vector (keep = TRUE retains them instead of removing
# them), then compute the term frequency of each letter.
okc_rec <- recipe(~ ., data = okc_text) %>%
  step_tokenize(essay0, token = "characters") %>%
  step_stopwords(
    essay0,
    custom_stopword_source = letters,
    keep = TRUE
  ) %>%
  step_tf(essay0)

# Estimate the recipe.
okc_obj <- prep(okc_rec)

# Apply the prepped recipe and show only the per-letter count columns.
okc_obj %>%
  bake(new_data = okc_text) %>%
  select(starts_with("tf_essay0"))
#> # A tibble: 750 x 26
#>    tf_essay0_a tf_essay0_b tf_essay0_c tf_essay0_d tf_essay0_e tf_essay0_f
#>          <dbl>       <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
#>  1          80          32          22          25          79          13
#>  2           8           3           5           5           8           0
#>  3         127          30          36          59         148          36
#>  4          28           3           5          10          34           6
#>  5          19           9           6           6          34           9
#>  6          97          21          22          34         130          26
#>  7         110          25          32          46         146          23
#>  8          66          13           9          23          69          12
#>  9           1           0           0           0           1           0
#> 10         250          76         115         106         274          53
#> # … with 740 more rows, and 20 more variables: tf_essay0_g <dbl>,
#> #   tf_essay0_h <dbl>, tf_essay0_i <dbl>, tf_essay0_j <dbl>, tf_essay0_k <dbl>,
#> #   tf_essay0_l <dbl>, tf_essay0_m <dbl>, tf_essay0_n <dbl>, tf_essay0_o <dbl>,
#> #   tf_essay0_p <dbl>, tf_essay0_q <dbl>, tf_essay0_r <dbl>, tf_essay0_s <dbl>,
#> #   tf_essay0_t <dbl>, tf_essay0_u <dbl>, tf_essay0_v <dbl>, tf_essay0_w <dbl>,
#> #   tf_essay0_x <dbl>, tf_essay0_y <dbl>, tf_essay0_z <dbl>

TF-IDF of ngrams of stemmed tokens

Sometimes fairly complicated computations are needed. Here we would like the term frequency-inverse document frequency (TF-IDF) of the 500 most common ngrams, computed on stemmed tokens. That is quite a handful and would seldom be included as an option in most other libraries, but the modularity of textrecipes makes this task fairly easy.

First we will tokenize according to words and then stem those words. We will then paste the stemmed tokens back together using step_untokenize so that we are back at a string, which we then tokenize again, this time using the ngram tokenizer. Lastly, we filter the tokens and compute the TF-IDF as usual.

okc_rec <- recipe(~ ., data = okc_text) %>%
  # 1. Split essay0 into word tokens and stem each of them.
  step_tokenize(essay0, token = "words") %>%
  step_stem(essay0) %>%
  # 2. Paste the stemmed tokens back into a single string per row ...
  step_untokenize(essay0) %>%
  # 3. ... so that it can be tokenized again, this time into ngrams.
  step_tokenize(essay0, token = "ngrams") %>%
  # 4. Limit the vocabulary to 500 ngrams and compute TF-IDF on them.
  step_tokenfilter(essay0, max_tokens = 500) %>%
  step_tfidf(essay0)

# Estimate the recipe.
okc_obj <- prep(okc_rec)

# Apply the prepped recipe and show only the generated TF-IDF columns.
okc_obj %>%
  bake(new_data = okc_text) %>%
  select(starts_with("tfidf_essay0"))
#> # A tibble: 750 x 500
#>    `tfidf_essay0_a… `tfidf_essay0_a… `tfidf_essay0_a… `tfidf_essay0_a…
#>               <dbl>            <dbl>            <dbl>            <dbl>
#>  1                0                0                0                0
#>  2                0                0                0                0
#>  3                0                0                0                0
#>  4                0                0                0                0
#>  5                0                0                0                0
#>  6                0                0                0                0
#>  7                0                0                0                0
#>  8                0                0                0                0
#>  9                0                0                0                0
#> 10                0                0                0                0
#> # … with 740 more rows, and 496 more variables: `tfidf_essay0_a br br` <dbl>,
#> #   `tfidf_essay0_a class ilink` <dbl>, `tfidf_essay0_a coupl of` <dbl>,
#> #   `tfidf_essay0_a few year` <dbl>, `tfidf_essay0_a good time` <dbl>,
#> #   `tfidf_essay0_a i can` <dbl>, `tfidf_essay0_a laid back` <dbl>,
#> #   `tfidf_essay0_a littl bit` <dbl>, `tfidf_essay0_a long a` <dbl>,
#> #   `tfidf_essay0_a lot and` <dbl>, `tfidf_essay0_a lot of` <dbl>,
#> #   `tfidf_essay0_a lover who` <dbl>, `tfidf_essay0_a man who` <dbl>,
#> #   `tfidf_essay0_a much a` <dbl>, `tfidf_essay0_a part of` <dbl>,
#> #   `tfidf_essay0_a sens of` <dbl>, `tfidf_essay0_a well a` <dbl>,
#> #   `tfidf_essay0_about me i` <dbl>, `tfidf_essay0_all kind of` <dbl>,
#> #   `tfidf_essay0_all over the` <dbl>, `tfidf_essay0_all the time` <dbl>,
#> #   `tfidf_essay0_also like to` <dbl>, `tfidf_essay0_am a veri` <dbl>,
#> #   `tfidf_essay0_am look for` <dbl>, `tfidf_essay0_am not a` <dbl>,
#> #   `tfidf_essay0_and a class` <dbl>, `tfidf_essay0_and am a` <dbl>,
#> #   `tfidf_essay0_and enjoi the` <dbl>, `tfidf_essay0_and go to` <dbl>,
#> #   `tfidf_essay0_and have a` <dbl>, `tfidf_essay0_and have been` <dbl>,
#> #   `tfidf_essay0_and have fun` <dbl>, `tfidf_essay0_and i am` <dbl>,
#> #   `tfidf_essay0_and i don't` <dbl>, `tfidf_essay0_and i have` <dbl>,
#> #   `tfidf_essay0_and i like` <dbl>, `tfidf_essay0_and i love` <dbl>,
#> #   `tfidf_essay0_and i try` <dbl>, `tfidf_essay0_and i'm not` <dbl>,
#> #   `tfidf_essay0_and like to` <dbl>, `tfidf_essay0_and live in` <dbl>,
#> #   `tfidf_essay0_and look for` <dbl>, `tfidf_essay0_and love it` <dbl>,
#> #   `tfidf_essay0_and love the` <dbl>, `tfidf_essay0_and love to` <dbl>,
#> #   `tfidf_essay0_and rais in` <dbl>, `tfidf_essay0_and try new` <dbl>,
#> #   `tfidf_essay0_and try to` <dbl>, `tfidf_essay0_and work in` <dbl>,
#> #   `tfidf_essay0_and would love` <dbl>, `tfidf_essay0_at least onc` <dbl>,
#> #   `tfidf_essay0_at the same` <dbl>, `tfidf_essay0_back in the` <dbl>,
#> #   `tfidf_essay0_back to the` <dbl>, `tfidf_essay0_bai area for` <dbl>,
#> #   `tfidf_essay0_bai area i` <dbl>, `tfidf_essay0_be abl to` <dbl>,
#> #   `tfidf_essay0_be in the` <dbl>, `tfidf_essay0_big fan of` <dbl>,
#> #   `tfidf_essay0_bit of a` <dbl>, `tfidf_essay0_born and rais` <dbl>,
#> #   `tfidf_essay0_br a class` <dbl>, `tfidf_essay0_br br a` <dbl>,
#> #   `tfidf_essay0_br br also` <dbl>, `tfidf_essay0_br br and` <dbl>,
#> #   `tfidf_essay0_br br at` <dbl>, `tfidf_essay0_br br br` <dbl>,
#> #   `tfidf_essay0_br br for` <dbl>, `tfidf_essay0_br br here` <dbl>,
#> #   `tfidf_essay0_br br i` <dbl>, `tfidf_essay0_br br i'm` <dbl>,
#> #   `tfidf_essay0_br br i'v` <dbl>, `tfidf_essay0_br br if` <dbl>,
#> #   `tfidf_essay0_br br im` <dbl>, `tfidf_essay0_br br imagin` <dbl>,
#> #   `tfidf_essay0_br br in` <dbl>, `tfidf_essay0_br br my` <dbl>,
#> #   `tfidf_essay0_br br oh` <dbl>, `tfidf_essay0_br br on` <dbl>,
#> #   `tfidf_essay0_br br so` <dbl>, `tfidf_essay0_br br the` <dbl>,
#> #   `tfidf_essay0_br br to` <dbl>, `tfidf_essay0_br br what` <dbl>,
#> #   `tfidf_essay0_br br when` <dbl>, `tfidf_essay0_br br you` <dbl>,
#> #   `tfidf_essay0_br i also` <dbl>, `tfidf_essay0_br i am` <dbl>,
#> #   `tfidf_essay0_br i believ` <dbl>, `tfidf_essay0_br i can` <dbl>,
#> #   `tfidf_essay0_br i do` <dbl>, `tfidf_essay0_br i don't` <dbl>,
#> #   `tfidf_essay0_br i enjoi` <dbl>, `tfidf_essay0_br i grew` <dbl>,
#> #   `tfidf_essay0_br i have` <dbl>, `tfidf_essay0_br i just` <dbl>,
#> #   `tfidf_essay0_br i like` <dbl>, `tfidf_essay0_br i love` <dbl>,
#> #   `tfidf_essay0_br i realli` <dbl>, `tfidf_essay0_br i think` <dbl>,
#> #   `tfidf_essay0_br i wa` <dbl>, …