For these packages, if we have already installed them we can just load them using the “library()” command. If not, copy the name of the package into an “install.packages(”packagename”)” command to install them from CRAN.
library(sf)
Warning: package 'sf' was built under R version 4.3.3
Linking to GEOS 3.11.2, GDAL 3.8.2, PROJ 9.3.1; sf_use_s2() is TRUE
library(raster)
Loading required package: sp
library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.3.3
Warning: package 'tidyr' was built under R version 4.3.3
Warning: package 'readr' was built under R version 4.3.3
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::extract() masks raster::extract()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::select() masks raster::select()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(zoo)
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
library(broom)
Warning: package 'broom' was built under R version 4.3.3
library(stringr)library(ggpmisc)
Warning: package 'ggpmisc' was built under R version 4.3.3
Loading required package: ggpp
Warning: package 'ggpp' was built under R version 4.3.3
Registered S3 methods overwritten by 'ggpp':
method from
heightDetails.titleGrob ggplot2
widthDetails.titleGrob ggplot2
Attaching package: 'ggpp'
The following object is masked from 'package:ggplot2':
annotate
Warning in .recacheSubclasses(def@className, def, env): undefined subclass
"ndiMatrix" of class "replValueSp"; definition not updated
Warning: package 'rio' was built under R version 4.3.3
6.2 Loading in necessary files
Our shapefile can be found in the data fellowship common folder. Just adjust the filepath to use your own username before the Box section of the filepath.
# You may need to adjust the filepathusr <-Sys.info()[7]district_shp <-readRDS(paste0("C:/Users/", usr,"/Box/Data fellowship program planning/Data fellowship common folder/dataset for practice/zambia_districts.RDS"))#use your shapefile to make an outline of the country boundariesbnd <- district_shp %>% rmapshaper::ms_dissolve() %>%st_geometry()bnd_coords <-st_coordinates(district_shp)[,1:2] %>% as.data.frame %>%setNames(c("x", "y"))
Currently this script uses health facility location data from the Zambia NMEC instance of DHIS2.
#use your own username here in place of "wsheahan"nmec <-readRDS(paste0("C:/Users/", usr,"/Box/Data fellowship program planning/Data fellowship common folder/dataset for practice/zambia_facility_locations.RDS"))
mispecified_location <-function(src, by ="province", cmbl = nmec) {suppressMessages( out <- cmbl %>%filter(source == {{src}},!is.na(lon)) %>%st_as_sf(coords =c("lon", "lat"), crs =st_crs(bnd)) %>%st_join(district_shp) %>%mutate(lon =unlist(map(geometry,1)),lat =unlist(map(geometry,2))) %>%st_drop_geometry() %>% dplyr::select(org_unit_uid, lon, lat, province, geo_province, district, geo_district,everything()) )if(by =="province") {return(filter(out, province != geo_province))}if(by =="district") {return(filter(out, district != geo_district))}}mispecified_location("NMEC", by ="province")
# A tibble: 286 × 18
org_unit_uid lon lat province geo_province district geo_district
<chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 zcPf4NedM57 24.5 -14.0 Western Northwestern Lukulu Mufumbwe
2 J6bFDOwxKLx 24.5 -14.0 Western Northwestern Lukulu Mufumbwe
3 Bewp3Y613qp 24.5 -14.0 Western Northwestern Lukulu Mufumbwe
4 AnK50NuUn1M 26.8 -15.3 Southern Central Itezhi-tezhi Mumbwa
5 Eusw992Dk0H 25.1 -16.4 Southern Western Kazungula Mulobezi
6 DEaMRBXpa5H 25.1 -17.2 Southern Western Kazungula Mwandi
7 n3vwfWM2frv 26.6 -15.3 Southern Central Itezhi-tezhi Mumbwa
8 b9ZkT5JVZeq 27.8 -15.7 Southern Lusaka Mazabuka Kafue
9 q3ToaprI7Lg 26.6 -15.6 Central Southern Mumbwa Itezhi-tezhi
10 Ch6ck1OVbvz 28.0 -15.4 Central Lusaka Shibuyunji Chilanga
# ℹ 276 more rows
# ℹ 11 more variables: org_unit_level <int>, parent_name <chr>,
# short_name <chr>, name <chr>, created <chr>, source <chr>, c_month <chr>,
# c_year <chr>, pop_grid3_2022 <dbl>, area_km2 <dbl>, reported_district <chr>
mispecified_location("NMEC", by ="district")
# A tibble: 1,615 × 18
org_unit_uid lon lat province geo_province district geo_district
<chr> <dbl> <dbl> <chr> <chr> <chr> <chr>
1 cZhbmotVmjD 31.1 -14.5 Eastern Eastern Petauke Nyimba
2 zcPf4NedM57 24.5 -14.0 Western Northwestern Lukulu Mufumbwe
3 J6bFDOwxKLx 24.5 -14.0 Western Northwestern Lukulu Mufumbwe
4 tdZNyirEls7 22.8 -16.5 Western Western Shang'ombo Sioma
5 fET5Q3mqIIx 22.8 -16.5 Western Western Shang'ombo Sioma
6 Vdri40jbFvG 22.8 -16.5 Western Western Shang'ombo Sioma
7 xmF8B7xkbkU 28.2 -15.5 Lusaka Lusaka Chilanga Lusaka
8 sEjZ3MhIYAE 28.2 -15.5 Lusaka Lusaka Chilanga Lusaka
9 Tnr0CSfapvM 28.2 -15.5 Lusaka Lusaka Chilanga Lusaka
10 fcq9iqgC52e 23.7 -13.9 Northwestern Northwestern Kabompo Zambezi
# ℹ 1,605 more rows
# ℹ 11 more variables: org_unit_level <int>, parent_name <chr>,
# short_name <chr>, name <chr>, created <chr>, source <chr>, c_month <chr>,
# c_year <chr>, pop_grid3_2022 <dbl>, area_km2 <dbl>, reported_district <chr>
Misspecified_location does something similar. With this function we can build a dataset of all the points that we believe are in the “wrong” district and then shift their coordinates as needed.
mis_loc_chw <-mispecified_location("NMEC", by ="district")#Shift coordinatesshift_coord <-function(org_coords, valid_coords) {# input_class <- class(org_coords) org_coords <-as.data.frame(org_coords) n <-nrow(org_coords) x <- y <-numeric(length = n)for(i in1:n){ new_loc <- valid_coords[which.min( (valid_coords[,1]-org_coords[i,1])^2+ (valid_coords[,2]-org_coords[i,2])^2),] x[i] <- new_loc[1] y[i] <- new_loc[2] } out <-cbind(x, y)# If input is a dataframe, return a dataframe with the same column namesif(class(org_coords) =="data.frame") { out <-as.data.frame(out)names(out) <-names(org_coords) }return(out)}
These are some standard geocorrections for common errors such as lat/lon being 0, having a - or + flipped in the lat/lon, or when lat/lon are flipped it basically checks to see if fixing those common issues would put the point back into the bnd area where it belongs.
std_geocorrections <-outside_boundary("NMEC") %>%mutate(new_lon =case_when(# Set 0 to NA lon ==0~NA_real_,# Fixing signsbetween(-lon, min(bnd_coords$x), max(bnd_coords$x)) &between(-lat, min(bnd_coords$y), max(bnd_coords$y)) ~-lon,# Lat and lon flippedbetween(lon, min(bnd_coords$y), max(bnd_coords$y)) &between(lat, min(bnd_coords$x), max(bnd_coords$x)) ~ lat,# Otherwise leave unchangedTRUE~ lon),new_lat =case_when(# Set 0 to NA lat ==0~NA_real_,# Fixing signsbetween(-lon, min(bnd_coords$x), max(bnd_coords$x)) &between(-lat, min(bnd_coords$y), max(bnd_coords$y)) ~-lat,# Lat and lon flippedbetween(lon, min(bnd_coords$y), max(bnd_coords$y)) &between(lat, min(bnd_coords$x), max(bnd_coords$x)) ~ lon,# Only lat is negative# between(!lat, min(bnd_coords$y), max(bnd_coords$y)) &# between(-lat, min(bnd_coords$y), max(bnd_coords$y)) ~ -lat, lat >0~-lat,# Otherwise leave unchangedTRUE~ lat) ) %>%mutate(# Double flips (sign and x,y)new_lon2 =case_when(between(-new_lon, min(bnd_coords$y), max(bnd_coords$y)) ~-new_lat,TRUE~ new_lon ),new_lat2 =case_when(between(-new_lat, min(bnd_coords$x), max(bnd_coords$x)) ~-new_lon,TRUE~ new_lat ) )#EDIT: Dropped in fix to new_lon/lat fields (Check)std_geocorrections <- std_geocorrections %>%mutate(lon = new_lon2, lat = new_lat2) %>% dplyr::select(-c(new_lon, new_lon2, new_lat, new_lat2))
We can rerun outside_boundary on the list of “fixed” points to figure out which ones are still outside the boundary.
we can visually check these again, we should have a visibly smaller number of unfixed points than before.
ggplot(unfixed_geo) +geom_sf(data = bnd) +geom_point(aes(lon, lat, color = source), shape =21)
# This are corrected and can be swapped back instd_geo_fixed <-anti_join(std_geocorrections, unfixed_geo)
Joining with `by = join_by(org_unit_uid, lon, lat, org_unit_level, parent_name,
province, district, short_name, name, created, source, c_month, c_year)`
#Move bordering coordinates (<= 1 km) inside countrynear_boundary <-outside_boundary("NMEC", cmbl = std_geocorrections, return_sf = T) %>%mutate(near_bnd =st_is_within_distance(., bnd, dist = units::as_units(1, "km"), sparse = F)[,1]) %>%mutate(lon =unlist(map(geometry,1)),lat =unlist(map(geometry,2))) %>%st_drop_geometry() %>%filter(near_bnd ==TRUE) %>%mutate(new_lon =unlist(shift_coord(dplyr::select(., lon, lat), bnd_coords)[,1]),new_lat =unlist(shift_coord(dplyr::select(., lon, lat), bnd_coords)[,2]),lon = new_lon, lat = new_lat) %>% dplyr::select(-c(near_bnd, new_lon, new_lat))#join the corrected locations with the ones we just nudged insidecorrected_geolocations <-bind_rows(std_geo_fixed, near_boundary)#visualize to make sure they are all inside the boundaryggplot(corrected_geolocations) +geom_sf(data = bnd) +geom_point(aes(lon, lat, color = source))
We then want to set our remaining out-of-bound locations to NA and then visually establish that our final list of geocorrected entries are all within the bounds of our geography.
unfixed_locations <-outside_boundary("NMEC") %>%anti_join(corrected_geolocations, by =c("org_unit_uid"="org_unit_uid", "source"="source")) %>%mutate(lon =NA_real_, lat =NA_real_)geocorrect_combined_list <- nmec %>%# Remove original entriesanti_join(outside_boundary("NMEC")) %>%bind_rows(corrected_geolocations, unfixed_locations)
Joining with `by = join_by(org_unit_uid, org_unit_level, lon, lat, parent_name,
province, district, short_name, name, created, source, c_month, c_year)`
#get a finalized list of corrected chwsgc_nmec <- geocorrect_combined_list %>%filter(source =="NMEC") %>%filter(org_unit_level ==5)#Plot the points to make sure none remaining outside Zambiaggplot(gc_nmec) +geom_sf(data = bnd) +geom_point(aes(lon, lat, color = source), alpha =0.5)
Warning: Removed 2385 rows containing missing values or values outside the scale range
(`geom_point()`).