The speech package

Nicolas Schmidt, Diego Lujan, Juan Andres Moraes


Provides functions that convert floor speeches of Uruguayan legislators from PDF files into a tidy, clean data.frame.


# The development version from GitHub:
# requireNamespace() only checks that 'remotes' is installed; unlike require()
# it does not attach the package or warn when it is missing.
if (!requireNamespace("remotes", quietly = TRUE)) install.packages("remotes")

Data generation process

1 - Floor speeches

2 - Data extraction

3 - First construction of the database: speech::speech_build()

4 - Final database: speech::speech_build(., compiler = TRUE)


# First construction of the database from the document passed as `file`.
# NOTE(review): `url` is an empty string here -- presumably a placeholder;
# supply the address of the PDF to be processed.
url <- ""
text <- speech::speech_build(file = url)
#> # A tibble: 24 x 6
#>    legislator speech         chamber  date                legislature id        
#>    <chr>      <chr>          <chr>    <dttm>                    <dbl> <chr>     
#>  1 BORDABERRY SEÑOR BORDABE… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  2 BORDABERRY SEÑOR BORDABE… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  3 AVIAGA     SEÑORA AVIAGA… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  4 AVIAGA     SEÑORA AVIAGA… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  5 GOI        SEÑOR GOÑI. P… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  6 GOI        SEÑOR GOÑI. E… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  7 MAHIA      SEÑOR MAHIA. … COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  8 MAHIA      SEÑOR MAHIA. … COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  9 ABDALA     SEÑOR ABDALA.… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#> 10 ASTI       SEÑOR ASTI. O… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#> # … with 14 more rows

# List the legislator names found in `text` that start with each initial.
speech_check(text, initial = c("A", "M"))
#> $A
#>   legislator
#> 1     ABDALA
#> 2       ASTI
#> 3     AVIAGA
#> $M
#>   legislator
#> 1      MAHIA
#> 2     MERONI

# Replace the legislator label "GOI" with "GOÑI" in the data built so far.
text <- speech_legis_replace(tidy_speech = text, old = "GOI", new = "GOÑI")
#> # A tibble: 24 x 6
#>    legislator speech         chamber  date                legislature id        
#>    <chr>      <chr>          <chr>    <dttm>                    <dbl> <chr>     
#>  1 BORDABERRY SEÑOR BORDABE… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  2 BORDABERRY SEÑOR BORDABE… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  3 AVIAGA     SEÑORA AVIAGA… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  4 AVIAGA     SEÑORA AVIAGA… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  5 GOÑI       SEÑOR GOÑI. P… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  6 GOÑI       SEÑOR GOÑI. E… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  7 MAHIA      SEÑOR MAHIA. … COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  8 MAHIA      SEÑOR MAHIA. … COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#>  9 ABDALA     SEÑOR ABDALA.… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#> 10 ASTI       SEÑOR ASTI. O… COMISIO… 2019-09-17 00:00:00          48 0?width=8…
#> # … with 14 more rows

# Final database: with compiler = TRUE the printed result has one row per
# legislator. `text` is rebuilt from `url`, so the name replacement made on
# the previous object is not carried over (see "GOI" in the output).
text <- speech::speech_build(file = url, compiler = TRUE)
#> # A tibble: 11 x 6
#>    legislator legislature chamber    date   id              speech              
#>    <chr>      <chr>       <chr>      <chr>  <chr>           <chr>               
#>  1 ABDALA     48          COMISION … 2019-… 0?width=800&he… SEÑOR ABDALA. Voto …
#>  2 ASTI       48          COMISION … 2019-… 0?width=800&he… SEÑOR ASTI. Obviame…
#>  3 AVIAGA     48          COMISION … 2019-… 0?width=800&he… SEÑORA AVIAGA. Pido…
#>  4 BORDABERRY 48          COMISION … 2019-… 0?width=800&he… SEÑOR BORDABERRY. P…
#>  5 GOI        48          COMISION … 2019-… 0?width=800&he… SEÑOR GOÑI. Pido la…
#>  6 LAZO       48          COMISION … 2019-… 0?width=800&he… SEÑORA LAZO. Voto p…
#>  7 MAHIA      48          COMISION … 2019-… 0?width=800&he… SEÑOR MAHIA. Pido l…
#>  8 MERONI     48          COMISION … 2019-… 0?width=800&he… SEÑOR MERONI. Voto,…
#>  9 PEREYRA    48          COMISION … 2019-… 0?width=800&he… SEÑORA PEREYRA. Con…
#> 10 TOURNE     48          COMISION … 2019-… 0?width=800&he… SEÑORA TOURNE. Voy …
#> 11 VIERA      48          COMISION … 2019-… 0?width=800&he… SEÑOR VIERA. Voto p…

# Store the result of speech_word_count() for every speech in a new `word`
# column (an integer column in the output below).
text$word <- speech_word_count(text$speech)

#> Observations: 11
#> Variables: 7
#> $ legislator  <chr> "ABDALA", "ASTI", "AVIAGA", "BORDABERRY", "GOI", "LAZO", …
#> $ legislature <chr> "48", "48", "48", "48", "48", "48", "48", "48", "48", "48…
#> $ date        <chr> "2019-09-17", "2019-09-17", "2019-09-17", "2019-09-17", "…
#> $ id          <chr> "0?width=800&height=600&hl=en_US1&iframe=true&rel=nofollo…
#> $ speech      <chr> "SEÑOR ABDALA. Voto por la señora legisladora Daisy Tourn…
#> $ word        <int> 400, 46, 107, 963, 100, 103, 128, 12, 12, 111, 8

Possible application


# Split text into words and keep only the words longer than `min` characters.
#
# @param string Character vector of text. Punctuation is removed before
#   splitting.
# @param min Length threshold; a word is kept only when it has strictly more
#   than `min` characters (default 3).
# @return A single character vector with the retained words from all elements
#   of `string`.
minchar <- function(string, min = 3){
    string <- stringr::str_remove_all(string, "[[:punct:]]")
    # Split on single spaces; the empty tokens produced by repeated spaces are
    # discarded by the length filter below (for any non-negative `min`).
    string <- unlist(strsplit(string, " "))
    string[nchar(string) > min]
}

# Word cloud of the words longer than four characters, leaving out the
# tokens "señor" and "señora".
text$speech %>%
  minchar(min = 4) %>%
  quanteda::corpus() %>%
  quanteda::dfm(remove = c("señor", "señora")) %>%
  quanteda::textplot_wordcloud(
    color = rev(RColorBrewer::brewer.pal(10, "RdBu"))
  )


# Horizontal bar chart of the 40 most frequent words longer than four
# characters, excluding words that start with "señor".
text$speech %>%
  minchar(min = 4) %>%
  tibble::enframe() %>%
  tidytext::unnest_tokens(word, value) %>%
  dplyr::count(word, sort = TRUE) %>%
  # Order the factor levels by frequency so the bars are sorted in the plot.
  dplyr::mutate(word = stats::reorder(word, n)) %>%
  dplyr::filter(!stringr::str_detect(word, "^señor")) %>%
  # head() keeps at most 40 rows and, unlike .[1:40, ], never pads the data
  # with NA rows when fewer than 40 words remain.
  utils::head(40) %>%
  ggplot2::ggplot(ggplot2::aes(word, n)) +
  ggplot2::geom_col(col = "black", fill = "#00A08A", width = .7) +
  ggplot2::labs(x = "", y = "") +
  ggplot2::coord_flip()


To cite speech in publications, please use:



Nicolas Schmidt (