-
Notifications
You must be signed in to change notification settings - Fork 31
Open
Description
Here's a reprex using a larger shapefile with US congressional districts. Based on a single polygon taking 10 seconds to pass through hungarian_costmin(), this should take over 80 minutes to render the full cartogram. I've reduced the shapefile size from 65 MB to 3 MB already, but is there a way the hungarian algorithm can be optimized to run faster during grid assignment?
library(geogrid)
library(sf)
library(tidyverse)
library(tmap)
# Download and read example data
tmp_shps <- tempfile()
tmp_dir <- tempdir()
download.file(
"https://www2.census.gov/geo/tiger/TIGER2016/CD/tl_2016_us_cd115.zip",
tmp_shps
)
unzip(tmp_shps, exdir = tmp_dir)
us_shp <- st_read(paste(tmp_dir, "tl_2016_us_cd115.shp", sep = "/"))
# > 64394144 bytes
shp <- us_shp %>%
filter(!STATEFP %in% c('15', '02', '60', '72', '69', '78', '66')) %>% # just mainland
as('Spatial') %>%
rmapshaper::ms_simplify() %>% # simplification using rmapshaper. reduces size from 65 MB to 3 MB
st_as_sf()
# shp %>% object.size()
# > 3594432 bytes
# commented out: initial rawplot as in README example #
# substrRight <- function(x, n){
# substr(x, nchar(x)-n+1, nchar(x))
# }
# shp$SNAME <- paste(shp$STATEFP %>% as.character(),
# substrRight(shp$NAMELSAD %>% as.character(), 2),
# sep = '_') # get congr distr #
# rawplot <- tm_shape(shp) +
# tm_polygons() +
# tm_text("SNAME")
# rawplot
# commented out: exploring different grid layouts, settled on hexgrid seed 6 #
# par(mfrow = c(2, 3), mar = c(0, 0, 2, 0))
# for (i in 1:6) {
# new_cells <- calculate_grid(shape = original_shapes, grid_type = "hexagonal", seed = i)
# plot(new_cells, main = paste("Seed", i, sep = " "))
# }
# Calculate Grid
new_cells_hex <- calculate_grid(shape = shp, grid_type = "hexagonal", seed = 6)
# Assign Polygons
## assign_polygons() doesn't complete here, sometimes crashes. takes extremely long time to run
# resulthex3 <- assign_polygons(shp, new_cells_hex)
#####################
##
# Where the delay is: hungarian_costmin within assign_polygons()
##
# within assign_polygons.sf, which converts to spdf, and uses
# assign_polygons.SpatialPolygonsDataFrame()
# running equivalent contents line by line, just 1st element / polygon within loop
shape <- shp %>% as('Spatial')
original_points <- rgeos::gCentroid(spf, byid = TRUE)
shape@data$CENTROIX <- original_points$x
shape@data$CENTROIY <- original_points$y
shape@data$key_orig <- paste(original_points$x, original_points$y,
sep = "_")
new_polygons <- new_cells_hex
new_points <- new_polygons[[1]]
vector_length <- length(shape)
new_polygons2 <- new_polygons[[2]]
s_poly <- sp::SpatialPolygonsDataFrame(new_polygons2, as.data.frame(sp::coordinates(new_polygons2)))
s_poly$key_new <- paste(s_poly@data$V1, s_poly@data$V2,
sep = "_")
closest_site_vec <- vector(mode = "numeric", length = vector_length)
min_dist_vec <- vector(mode = "numeric", length = vector_length)
taken_vec <- vector(mode = "numeric", length = vector_length)
taken_vec_index <- integer(vector_length)
## within loop ##
i <- 1 # just first iteration
dist_vec <- sp::spDistsN1(original_points, new_points[i],
longlat = FALSE)
min_dist_vec[i] <- min(dist_vec)
closest_site_vec[i] <- which.min(dist_vec)
taken_vec[i] <- which.min(dist_vec)
taken_vec_index <- taken_vec[taken_vec > 0]
costmatrix <- sp::spDists(original_points, new_points,
longlat = FALSE)
colnames(costmatrix) <- paste(s_poly@data$V1, s_poly@data$V2,
sep = "_")
rownames(costmatrix) <- paste(original_points@coords[, 1], original_points@coords[, 2], sep = "_")
#################################################
### everything up to here evaluates instantly ###
#################################################
system.time(hungarian_costmin <- hungarian_cc(costmatrix)) # Just one element takes 10.8 s, MBP 2.2 GHz i7 16 GB RAM
# user system elapsed
# 10.796 0.033 10.876
# end loop
# 11 * 436 obs ~ 4796 seconds / 80 minutes
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels