library("elastic")
The main interface for searching documents in your Elasticsearch store is the function Search(). I nearly always develop R software using all-lowercase names, but R already has a function called search(), and I wanted to avoid a collision with that function.
Search() is an interface to both the HTTP search API (in which queries are passed in the URI of the request, meaning queries have to be relatively simple) and the POST API, or Query DSL, in which queries are passed in the body of the request (so they can be much more complex).
There are a huge number of ways you can search Elasticsearch documents - this tutorial covers some of them, and highlights the ways in which you interact with the R outputs.
# Connect using connect()'s default arguments, then search the index with no
# query at all.
x <- connect()
out <- Search(x, index = "shakespeare")
# Total number of hits reported for the search
out[["hits"]][["total"]]
#> [1] 5000
# First hit returned
out[["hits"]][["hits"]][[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "line"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
# Same search, additionally passing the document type; show the first hit.
Search(x, index = "shakespeare", type = "line")[["hits"]][["hits"]][[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "line"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
# Ask for only the play_name and speaker fields of `_source` via the request
# body; show the first hit.
Search(
  x,
  index = "shakespeare",
  body = '{
"_source": ["play_name", "speaker"]
}'
)$hits$hits[[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "line"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$speaker
#> [1] ""
# Request a single hit (size = 1) starting at offset 1 (from = 1).
Search(x, index = "shakespeare", size = 1, from = 1)[["hits"]]
#> $total
#> [1] 5000
#>
#> $max_score
#> [1] 1
#>
#> $hits
#> $hits[[1]]
#> $hits[[1]]$`_index`
#> [1] "shakespeare"
#>
#> $hits[[1]]$`_type`
#> [1] "line"
#>
#> $hits[[1]]$`_id`
#> [1] "14"
#>
#> $hits[[1]]$`_score`
#> [1] 1
#>
#> $hits[[1]]$`_source`
#> $hits[[1]]$`_source`$line_id
#> [1] 15
#>
#> $hits[[1]]$`_source`$play_name
#> [1] "Henry IV"
#>
#> $hits[[1]]$`_source`$speech_number
#> [1] 1
#>
#> $hits[[1]]$`_source`$line_number
#> [1] "1.1.12"
#>
#> $hits[[1]]$`_source`$speaker
#> [1] "KING HENRY IV"
#>
#> $hits[[1]]$`_source`$text_entry
#> [1] "Did lately meet in the intestine shock"
Using the q parameter you can pass in a query, which gets sent in the URI of the request. This type of query is less powerful than a query passed in the body of the request using the body parameter (shown further below).
# URI query on the speaker field; only the hit total is shown.
# NOTE(review): the total reported below (5000) equals the total of the
# unfiltered search of this index, so this unquoted multi-word value may not be
# restricting results to the speaker as intended. A quoted phrase such as
# q = 'speaker:"KING HENRY IV"' may be what was meant -- confirm against a
# live index before changing.
Search(x, index="shakespeare", type="line", q="speaker:KING HENRY IV")$hits$total
#> [1] 5000
Here, we query for values from 10 to 20 in the field line_id:
# Range syntax inside a URI query: line_id between 10 and 20.
Search(x, index = "shakespeare", q = "line_id:[10 TO 20]")[["hits"]][["total"]]
#> [1] 11
The version number is usually not returned with each hit. Pass version=TRUE to include it:
# Request document versions (version = TRUE) for two hits and extract the
# `_version` element of each hit.
# vapply() pins the result to a numeric vector: unlike sapply(), it cannot
# silently return a list when no hits come back or a hit lacks `_version`.
vapply(
  Search(x, index = "shakespeare", version = TRUE, size = 2)$hits$hits,
  "[[",
  numeric(1),
  "_version"
)
#> [1] 3 4
# raw = TRUE returns the response as a JSON string instead of a parsed R list.
Search(
  x,
  index = "shakespeare",
  type = "line",
  raw = TRUE
)
#> [1] "{\"took\":0,\"timed_out\":false,\"_shards\":{\"total\":5,\"successful\":5,\"skipped\":0,\"failed\":0},\"hits\":{\"total\":5000,\"max_score\":1.0,\"hits\":[{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"0\",\"_score\":1.0,\"_source\":{\"line_id\":1,\"play_name\":\"Henry IV\",\"line_number\":\"\",\"speaker\":\"\",\"text_entry\":\"ACT I\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"14\",\"_score\":1.0,\"_source\":{\"line_id\":15,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.12\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Did lately meet in the intestine shock\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"19\",\"_score\":1.0,\"_source\":{\"line_id\":20,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.17\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"The edge of war, like an ill-sheathed knife,\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"22\",\"_score\":1.0,\"_source\":{\"line_id\":23,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.20\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Whose soldier now, under whose blessed cross\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"24\",\"_score\":1.0,\"_source\":{\"line_id\":25,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.22\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Forthwith a power of English shall we levy;\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"25\",\"_score\":1.0,\"_source\":{\"line_id\":26,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.23\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"Whose arms were moulded in their mothers womb\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"26\",\"_score\":1.0,\"_source\":{\"line_id\":27,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.24\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"To chase these pagans in those holy 
fields\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"29\",\"_score\":1.0,\"_source\":{\"line_id\":30,\"play_name\":\"Henry IV\",\"speech_number\":1,\"line_number\":\"1.1.27\",\"speaker\":\"KING HENRY IV\",\"text_entry\":\"For our advantage on the bitter cross.\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"40\",\"_score\":1.0,\"_source\":{\"line_id\":41,\"play_name\":\"Henry IV\",\"speech_number\":2,\"line_number\":\"1.1.38\",\"speaker\":\"WESTMORELAND\",\"text_entry\":\"Whose worst was, that the noble Mortimer,\"}},{\"_index\":\"shakespeare\",\"_type\":\"line\",\"_id\":\"41\",\"_score\":1.0,\"_source\":{\"line_id\":42,\"play_name\":\"Henry IV\",\"speech_number\":2,\"line_number\":\"1.1.39\",\"speaker\":\"WESTMORELAND\",\"text_entry\":\"Leading the men of Herefordshire to fight\"}}]}}"
Common options are verbose=TRUE, timeout_ms=1, and followlocation=TRUE.
# Pass verbose = TRUE through Search() along with the usual arguments.
out <- Search(
  x,
  index = "shakespeare",
  type = "line",
  verbose = TRUE
)
Pass in as an R list
# Update the mapping for shakespeare/line so that `text_entry` is a text field
# with fielddata enabled.
mapping_create(
  x,
  "shakespeare",
  "line",
  update_all_types = TRUE,
  body = '{
"properties": {
"text_entry": {
"type": "text",
"fielddata": true
}
}
}'
)
#> $acknowledged
#> [1] TRUE
# Terms aggregation on `text_entry`, expressed as a nested R list.
aggs <- list(
  aggs = list(
    stats = list(
      terms = list(field = "text_entry")
    )
  )
)
Search(x, index = "shakespeare", body = aggs)[["hits"]][["hits"]][[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "line"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
Or pass in as json query with newlines, easy to read
# The same terms aggregation, written as a multi-line JSON string.
aggs <- '{
"aggs": {
"stats" : {
"terms" : {
"field" : "text_entry"
}
}
}
}'
Search(x, index = "shakespeare", body = aggs)[["hits"]][["hits"]][[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "line"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
Or pass in collapsed json string
# The same terms aggregation again, as a single-line JSON string.
aggs <- '{"aggs":{"stats":{"terms":{"field":"text_entry"}}}}'
Search(x, index = "shakespeare", body = aggs)[["hits"]][["hits"]][[1]]
#> $`_index`
#> [1] "shakespeare"
#>
#> $`_type`
#> [1] "line"
#>
#> $`_id`
#> [1] "0"
#>
#> $`_score`
#> [1] 1
#>
#> $`_source`
#> $`_source`$line_id
#> [1] 1
#>
#> $`_source`$play_name
#> [1] "Henry IV"
#>
#> $`_source`$line_number
#> [1] ""
#>
#> $`_source`$speaker
#> [1] ""
#>
#> $`_source`$text_entry
#> [1] "ACT I"
Histograms
# Histogram aggregation over decimalLatitude with an interval of 5.
aggs <- '{
"aggs": {
"latbuckets" : {
"histogram" : {
"field" : "decimalLatitude",
"interval" : 5
}
}
}
}'
# size = 0 is passed because only the aggregation buckets are inspected here;
# show the first three buckets.
Search(
  x,
  index = "gbif",
  body = aggs,
  size = 0
)$aggregations$latbuckets$buckets[1:3]
#> [[1]]
#> [[1]]$key
#> [1] -35
#>
#> [[1]]$doc_count
#> [1] 1
#>
#>
#> [[2]]
#> [[2]]$key
#> [1] -30
#>
#> [[2]]$doc_count
#> [1] 0
#>
#>
#> [[3]]
#> [[3]]$key
#> [1] -25
#>
#> [[3]]$doc_count
#> [1] 0
# Boolean query that excludes documents whose speech_number is in the range
# 1 to 5.
mmatch <- '{
"query": {
"bool" : {
"must_not" : {
"range" : {
"speech_number" : {
"from" : 1, "to": 5
}}}}}}'
# Extract speech_number from each hit. lapply() always returns a list, whereas
# sapply() only did so here because a hit without speech_number yields NULL.
# The lambda parameter is named `hit` so it no longer shadows the connection
# object `x`.
lapply(
  Search(x, index = "shakespeare", body = mmatch)$hits$hits,
  function(hit) hit$`_source`$speech_number
)
#> [[1]]
#> NULL
#>
#> [[2]]
#> [1] 6
#>
#> [[3]]
#> [1] 7
#>
#> [[4]]
#> [1] 7
#>
#> [[5]]
#> [1] 7
#>
#> [[6]]
#> [1] 8
#>
#> [[7]]
#> [1] 8
#>
#> [[8]]
#> [1] 9
#>
#> [[9]]
#> [1] 9
#>
#> [[10]]
#> [1] 10
Fuzzy query on a text field
# Fuzzy query on `text_entry`, short form: field = value.
fuzzy <- list(
  query = list(
    fuzzy = list(text_entry = "arms")
  )
)
Search(x, index = "shakespeare", body = fuzzy)$hits$total
#> [1] 49
# Same field, long form with an explicit `fuzziness` value.
fuzzy <- list(
  query = list(
    fuzzy = list(
      text_entry = list(value = "arms", fuzziness = 4)
    )
  )
)
Search(x, index = "shakespeare", body = fuzzy)$hits$total
#> [1] 617
Range query with numeric values
# decimalLongitude between 1 and 3 (both bounds inclusive: gte / lte).
body <- list(
  query = list(
    range = list(
      decimalLongitude = list(gte = 1, lte = 3)
    )
  )
)
Search(x, index = "gbif", body = body)$hits$total
#> [1] 24
# decimalLongitude between 2.9 and 10.
body <- list(
  query = list(
    range = list(
      decimalLongitude = list(gte = 2.9, lte = 10)
    )
  )
)
Search(x, index = "gbif", body = body)$hits$total
#> [1] 126
With dates
# eventDate from 2012-01-01 up to "now".
body <- list(
  query = list(
    range = list(
      eventDate = list(gte = "2012-01-01", lte = "now")
    )
  )
)
Search(x, index = "gbif", body = body)$hits$total
#> [1] 301
# eventDate from 2014-01-01 up to "now".
body <- list(
  query = list(
    range = list(
      eventDate = list(gte = "2014-01-01", lte = "now")
    )
  )
)
Search(x, index = "gbif", body = body)$hits$total
#> [1] 292
# more_like_this query against the abstract and title fields, using the text
# "and then".
body <- '{
"query": {
"more_like_this": {
"fields": ["abstract","title"],
"like": "and then",
"min_term_freq": 1,
"max_query_terms": 12
}
}
}'
Search(x, index = "plos", body = body)[["hits"]][["total"]]
#> [1] 488
# Same query shape, using the text "cell".
body <- '{
"query": {
"more_like_this": {
"fields": ["abstract","title"],
"like": "cell",
"min_term_freq": 1,
"max_query_terms": 12
}
}
}'
Search(x, index = "plos", body = body)[["hits"]][["total"]]
#> [1] 58
# query_string search for "cell", asking for highlighted fragments of the
# title field (up to two fragments per hit).
body <- '{
"query": {
"query_string": {
"query" : "cell"
}
},
"highlight": {
"fields": {
"title": {"number_of_fragments": 2}
}
}
}'
out <- Search(x, index = "plos", type = "article", body = body)
out[["hits"]][["total"]]
#> [1] 58
# Take the first highlighted title fragment of every hit, then show hits 8-10.
sapply(
  out$hits$hits,
  function(hit) hit$highlight$title[[1]]
)[8:10]
#> [1] "Chronic Hypoxia Promotes Pulmonary Artery Endothelial <em>Cell</em> Proliferation through H2O2-Induced 5-Lipoxygenase"
#> [2] "Dynamic Visualization of Dendritic <em>Cell</em>-Antigen Interactions in the Skin Following Transcutaneous Immunization"
#> [3] "A New Class of Pluripotent Stem <em>Cell</em> Cytotoxic Small Molecules"
# Total number of hits for the wildcard query "a*".
Search(x, index = "shakespeare", q = "a*")[["hits"]][["total"]]
#> [1] 2747
# Run the same query with time_scroll = "1m", then fetch one page with
# scroll() using the returned scroll id and count the hits on that page.
res <- Search(x, index = "shakespeare", q = "a*", time_scroll = "1m")
length(
  scroll(x, res[["_scroll_id"]], time_scroll = "1m")$hits$hits
)
#> [1] 10
# Start a scroll (time_scroll = "5m") and keep requesting pages until one comes
# back with no hits.
res <- Search(x, index = "shakespeare", q = "a*", time_scroll = "5m")
# Store each page as one list element and flatten once at the end, instead of
# re-copying an ever-growing list with c() on every iteration.
pages <- list(res$hits$hits)
repeat {
  res <- scroll(x, res$`_scroll_id`)
  if (length(res$hits$hits) == 0) {
    break
  }
  pages[[length(pages) + 1L]] <- res$hits$hits
}
# Concatenate the per-page hit lists into a single list of hits.
out <- do.call(c, pages)
length(out)
#> [1] 2747
# `res` is now the last (empty) scroll response; show the total it reports.
res$hits$total
#> [1] 2747
Woohoo! Collected all 2747 documents in very little time.