queryparser

queryparser translates SQL queries into lists of unevaluated R expressions.

⚠️ Most R users should not directly use queryparser. Instead, use it through tidyquery.

Installation

Install the released version of queryparser from CRAN with:

install.packages("queryparser")

Or install the development version from GitHub with:

# install.packages("devtools")
devtools::install_github("ianmcook/queryparser")

Usage

Call the function parse_query(), passing a SELECT statement enclosed in quotes as the first argument:

library(queryparser)

parse_query("SELECT DISTINCT carrier FROM flights WHERE dest = 'HNL'")
#> $select
#> $select[[1]]
#> carrier
#> 
#> attr(,"distinct")
#> [1] TRUE
#> 
#> $from
#> $from[[1]]
#> flights
#> 
#> 
#> $where
#> $where[[1]]
#> dest == "HNL"

Queries can include the clauses SELECT, FROM, WHERE, GROUP BY, HAVING, ORDER BY, and LIMIT:

parse_query(
" SELECT origin, dest,
    COUNT(flight) AS num_flts,
    round(SUM(seats)) AS num_seats,
    round(AVG(arr_delay)) AS avg_delay
  FROM flights f LEFT OUTER JOIN planes p
    ON f.tailnum = p.tailnum
  WHERE distance BETWEEN 200 AND 300
    AND air_time IS NOT NULL
  GROUP BY origin, dest
  HAVING num_flts > 3000
  ORDER BY num_seats DESC, avg_delay ASC
  LIMIT 2;"
)
#> $select
#> $select[[1]]
#> origin
#> 
#> $select[[2]]
#> dest
#> 
#> $select$num_flts
#> sum(!is.na(flight))
#> 
#> $select$num_seats
#> round(sum(seats, na.rm = TRUE))
#> 
#> $select$avg_delay
#> round(mean(arr_delay, na.rm = TRUE))
#> 
#> attr(,"aggregate")
#>                      num_flts num_seats avg_delay 
#>     FALSE     FALSE      TRUE      TRUE      TRUE 
#> 
#> $from
#> $from$f
#> flights
#> 
#> $from$p
#> planes
#> 
#> attr(,"join_types")
#> [1] "left outer join"
#> attr(,"join_conditions")
#> attr(,"join_conditions")[[1]]
#> f.tailnum == p.tailnum
#> 
#> 
#> $where
#> $where[[1]]
#> (distance >= 200 & distance <= 300) & !is.na(air_time)
#> 
#> 
#> $group_by
#> $group_by[[1]]
#> origin
#> 
#> $group_by[[2]]
#> dest
#> 
#> 
#> $having
#> $having[[1]]
#> num_flts > 3000
#> 
#> 
#> $order_by
#> $order_by[[1]]
#> -xtfrm(num_seats)
#> 
#> $order_by[[2]]
#> avg_delay
#> 
#> attr(,"aggregate")
#> [1] FALSE FALSE
#> 
#> $limit
#> $limit[[1]]
#> [1] 2
#> 
#> 
#> attr(,"aggregate")
#> [1] TRUE

Set the argument tidyverse to TRUE to use functions from tidyverse packages including dplyr, stringr, and lubridate in the R expressions:

parse_query("SELECT COUNT(*) AS n FROM t WHERE x BETWEEN y AND z ORDER BY n DESC", tidyverse = TRUE)
#> $select
#> $select$n
#> dplyr::n()
#> 
#> attr(,"aggregate")
#>    n 
#> TRUE 
#> 
#> $from
#> $from[[1]]
#> t
#> 
#> 
#> $where
#> $where[[1]]
#> dplyr::between(x, y, z)
#> 
#> 
#> $order_by
#> $order_by[[1]]
#> dplyr::desc(n)
#> 
#> attr(,"aggregate")
#> [1] FALSE
#> 
#> attr(,"aggregate")
#> [1] TRUE

queryparser will translate only explicitly allowed functions and operators, preventing injection of malicious code:

parse_query("SELECT x FROM y WHERE system('rm -rf /')")
#> Error: Unrecognized function or operator: system

Current Limitations

queryparser does not currently support:

queryparser currently has the following known limitations:

Non-Goals

queryparser is not intended to:

The sqlparseR package (CRAN) provides a wrapper around the Python module sqlparse.