Clean Biological Occurrence Records
Clean using the following use cases (checkmarks indicate fxns exist - not necessarily complete):
taxize
(one method so far)A note about examples: We think that using a piping workflow with %>%
makes code easier to build up, and easier to understand. However, in some examples we provide examples without the pipe to demonstrate traditional usage.
Stable CRAN version
Development version
Remove impossible coordinates (using sample data included in the pkg)
# coord_impossible(dframe(sample_data_1)) # w/o pipe
dframe(sample_data_1) %>% coord_impossible()
#> # A tibble: 1,500 x 5
#> name longitude latitude date key
#> * <chr> <dbl> <dbl> <dttm> <int>
#> 1 Ursus americanus -79.7 38.4 2015-01-14 16:36:45 1065590124
#> 2 Ursus americanus -82.4 35.7 2015-01-13 00:25:39 1065588899
#> 3 Ursus americanus -99.1 23.7 2015-02-20 23:00:00 1098894889
#> 4 Ursus americanus -72.8 43.9 2015-02-13 16:16:41 1065611122
#> 5 Ursus americanus -72.3 43.9 2015-03-01 20:20:45 1088908315
#> 6 Ursus americanus -109. 32.7 2015-03-29 17:06:54 1088932238
#> 7 Ursus americanus -109. 32.7 2015-03-29 17:12:50 1088932273
#> 8 Ursus americanus -124. 40.1 2015-03-28 23:00:00 1132403409
#> 9 Ursus americanus -78.3 36.9 2015-03-20 21:11:24 1088923534
#> 10 Ursus americanus -76.8 35.5 2015-04-05 23:00:00 1088954559
#> # … with 1,490 more rows
Remove incomplete coordinates
# coord_incomplete(dframe(sample_data_1)) # w/o pipe
dframe(sample_data_1) %>% coord_incomplete()
#> # A tibble: 1,306 x 5
#> name longitude latitude date key
#> * <chr> <dbl> <dbl> <dttm> <int>
#> 1 Ursus americanus -79.7 38.4 2015-01-14 16:36:45 1065590124
#> 2 Ursus americanus -82.4 35.7 2015-01-13 00:25:39 1065588899
#> 3 Ursus americanus -99.1 23.7 2015-02-20 23:00:00 1098894889
#> 4 Ursus americanus -72.8 43.9 2015-02-13 16:16:41 1065611122
#> 5 Ursus americanus -72.3 43.9 2015-03-01 20:20:45 1088908315
#> 6 Ursus americanus -109. 32.7 2015-03-29 17:06:54 1088932238
#> 7 Ursus americanus -109. 32.7 2015-03-29 17:12:50 1088932273
#> 8 Ursus americanus -124. 40.1 2015-03-28 23:00:00 1132403409
#> 9 Ursus americanus -78.3 36.9 2015-03-20 21:11:24 1088923534
#> 10 Ursus americanus -76.8 35.5 2015-04-05 23:00:00 1088954559
#> # … with 1,296 more rows
Remove unlikely coordinates (e.g., those at 0,0)
# coord_unlikely(dframe(sample_data_1)) # w/o pipe
dframe(sample_data_1) %>% coord_unlikely()
#> # A tibble: 1,488 x 5
#> name longitude latitude date key
#> * <chr> <dbl> <dbl> <dttm> <int>
#> 1 Ursus americanus -79.7 38.4 2015-01-14 16:36:45 1065590124
#> 2 Ursus americanus -82.4 35.7 2015-01-13 00:25:39 1065588899
#> 3 Ursus americanus -99.1 23.7 2015-02-20 23:00:00 1098894889
#> 4 Ursus americanus -72.8 43.9 2015-02-13 16:16:41 1065611122
#> 5 Ursus americanus -72.3 43.9 2015-03-01 20:20:45 1088908315
#> 6 Ursus americanus -109. 32.7 2015-03-29 17:06:54 1088932238
#> 7 Ursus americanus -109. 32.7 2015-03-29 17:12:50 1088932273
#> 8 Ursus americanus -124. 40.1 2015-03-28 23:00:00 1132403409
#> 9 Ursus americanus -78.3 36.9 2015-03-20 21:11:24 1088923534
#> 10 Ursus americanus -76.8 35.5 2015-04-05 23:00:00 1088954559
#> # … with 1,478 more rows
Do all three
dframe(sample_data_1) %>%
coord_impossible() %>%
coord_incomplete() %>%
coord_unlikely()
#> # A tibble: 1,294 x 5
#> name longitude latitude date key
#> * <chr> <dbl> <dbl> <dttm> <int>
#> 1 Ursus americanus -79.7 38.4 2015-01-14 16:36:45 1065590124
#> 2 Ursus americanus -82.4 35.7 2015-01-13 00:25:39 1065588899
#> 3 Ursus americanus -99.1 23.7 2015-02-20 23:00:00 1098894889
#> 4 Ursus americanus -72.8 43.9 2015-02-13 16:16:41 1065611122
#> 5 Ursus americanus -72.3 43.9 2015-03-01 20:20:45 1088908315
#> 6 Ursus americanus -109. 32.7 2015-03-29 17:06:54 1088932238
#> 7 Ursus americanus -109. 32.7 2015-03-29 17:12:50 1088932273
#> 8 Ursus americanus -124. 40.1 2015-03-28 23:00:00 1132403409
#> 9 Ursus americanus -78.3 36.9 2015-03-20 21:11:24 1088923534
#> 10 Ursus americanus -76.8 35.5 2015-04-05 23:00:00 1088954559
#> # … with 1,284 more rows
Don’t drop bad data
dframe(sample_data_1) %>% coord_incomplete(drop = TRUE) %>% NROW
#> [1] 1306
dframe(sample_data_1) %>% coord_incomplete(drop = FALSE) %>% NROW
#> [1] 1500
smalldf <- sample_data_1[1:20, ]
# create a duplicate record
smalldf <- rbind(smalldf, smalldf[10,])
row.names(smalldf) <- NULL
# make it slightly different
smalldf[21, "key"] <- 1088954555
NROW(smalldf)
#> [1] 21
dp <- dframe(smalldf) %>% dedup()
NROW(dp)
#> [1] 20
attr(dp, "dups")
#> # A tibble: 1 x 5
#> name longitude latitude date key
#> <chr> <dbl> <dbl> <dttm> <dbl>
#> 1 Ursus americanus -76.8 35.5 2015-04-05 23:00:00 1088954555
Standardize/convert dates
df <- sample_data_1
# date_standardize(dframe(df), "%d%b%Y") # w/o pipe
dframe(df) %>% date_standardize("%d%b%Y")
#> # A tibble: 1,500 x 5
#> name longitude latitude date key
#> <chr> <dbl> <dbl> <chr> <int>
#> 1 Ursus americanus -79.7 38.4 14Jan2015 1065590124
#> 2 Ursus americanus -82.4 35.7 13Jan2015 1065588899
#> 3 Ursus americanus -99.1 23.7 20Feb2015 1098894889
#> 4 Ursus americanus -72.8 43.9 13Feb2015 1065611122
#> 5 Ursus americanus -72.3 43.9 01Mar2015 1088908315
#> 6 Ursus americanus -109. 32.7 29Mar2015 1088932238
#> 7 Ursus americanus -109. 32.7 29Mar2015 1088932273
#> 8 Ursus americanus -124. 40.1 28Mar2015 1132403409
#> 9 Ursus americanus -78.3 36.9 20Mar2015 1088923534
#> 10 Ursus americanus -76.8 35.5 05Apr2015 1088954559
#> # … with 1,490 more rows
Drop records without dates
Create date field from other fields
dframe(sample_data_2) %>% date_create(year, month, day)
#> # A tibble: 1,500 x 8
#> name longitude latitude key year month day date
#> <chr> <dbl> <dbl> <int> <chr> <chr> <chr> <chr>
#> 1 Ursus americanus -79.7 38.4 1065590124 2015 01 14 2015-01-14
#> 2 Ursus americanus -82.4 35.7 1065588899 2015 01 13 2015-01-13
#> 3 Ursus americanus -99.1 23.7 1098894889 2015 02 20 2015-02-20
#> 4 Ursus americanus -72.8 43.9 1065611122 2015 02 13 2015-02-13
#> 5 Ursus americanus -72.3 43.9 1088908315 2015 03 01 2015-03-01
#> 6 Ursus americanus -109. 32.7 1088932238 2015 03 29 2015-03-29
#> 7 Ursus americanus -109. 32.7 1088932273 2015 03 29 2015-03-29
#> 8 Ursus americanus -124. 40.1 1132403409 2015 03 28 2015-03-28
#> 9 Ursus americanus -78.3 36.9 1088923534 2015 03 20 2015-03-20
#> 10 Ursus americanus -76.8 35.5 1088954559 2015 04 05 2015-04-05
#> # … with 1,490 more rows
Filter by FAO areas
wkt <- 'POLYGON((72.2 38.5,-173.6 38.5,-173.6 -41.5,72.2 -41.5,72.2 38.5))'
manta_ray <- rgbif::name_backbone("Mobula alfredi")$usageKey
res <- rgbif::occ_data(manta_ray, geometry = wkt, limit=300, hasCoordinate = TRUE)
dat <- sf::st_as_sf(res$data, coords = c("decimalLongitude", "decimalLatitude"))
dat <- sf::st_set_crs(dat, 4326)
mapview::mapview(dat)
tmp <- eco_region(dframe(res$data), dataset = "fao", region = "OCEAN:Indian")
tmp <- tmp[!is.na(tmp$decimalLongitude), ]
tmp2 <- sf::st_as_sf(tmp, coords = c("decimalLongitude", "decimalLatitude"))
tmp2 <- sf::st_set_crs(tmp2, 4326)
mapview::mapview(tmp2)
scrubr
in R doing citation(package = 'scrubr')