Skip to content

Slow processing with assign_polygons() (hungarian_costmin()) for 3 MB shapefile #30

@lgtm70b

Description

@lgtm70b

Here's a reprex using a larger shapefile with US congressional districts. Based on a single polygon taking 10 seconds to pass through hungarian_costmin(), this should take over 80 minutes to render the full cartogram. I've reduced the shapefile size from 65 MB to 3 MB already, but is there a way the hungarian algorithm can be optimized to run faster during grid assignment?

library(geogrid)
library(sf)
library(tidyverse)
library(tmap)

# Download and read example data
tmp_shps <- tempfile()
tmp_dir <- tempdir()
download.file(
  "https://www2.census.gov/geo/tiger/TIGER2016/CD/tl_2016_us_cd115.zip",
  tmp_shps
)
unzip(tmp_shps, exdir = tmp_dir)
us_shp <- st_read(paste(tmp_dir, "tl_2016_us_cd115.shp", sep = "/"))
  # > 64394144 bytes
shp <- us_shp %>% 
  filter(!STATEFP %in% c('15', '02', '60', '72', '69', '78', '66')) %>% # just mainland
  as('Spatial') %>%
  rmapshaper::ms_simplify() %>% # simplification using rmapshaper. reduces size from 65 MB to 3 MB
  st_as_sf()

# shp %>% object.size()
# > 3594432 bytes

# commented out: initial rawplot as in README example #
# substrRight <- function(x, n){
# substr(x, nchar(x)-n+1, nchar(x))
# }
# shp$SNAME <- paste(shp$STATEFP %>% as.character(), 
#                   substrRight(shp$NAMELSAD %>% as.character(), 2), 
#                               sep = '_') # get congr distr #
# rawplot <- tm_shape(shp) +
#   tm_polygons() +
#   tm_text("SNAME")
# rawplot

# commented out: exploring different grid layouts, settled on hexgrid seed 6 #
# par(mfrow = c(2, 3), mar = c(0, 0, 2, 0))
# for (i in 1:6) {
#   new_cells <- calculate_grid(shape = original_shapes, grid_type = "hexagonal", seed = i)
#   plot(new_cells, main = paste("Seed", i, sep = " "))
# }

# Calculate Grid
    new_cells_hex <- calculate_grid(shape = shp, grid_type = "hexagonal", seed = 6)

# Assign Polygons
    ## assign_polygons() doesn't complete here, sometimes crashes. takes extremely long time to run
    # resulthex3 <- assign_polygons(shp, new_cells_hex) 
    #####################

##
# Where the delay is: hungarian_costmin within assign_polygons()
##

# within assign_polygons.sf, which converts to spdf, and uses
  # assign_polygons.SpatialPolygonsDataFrame()
# running equivalent contents line by line, just 1st element / polygon within loop

  shape <- shp %>% as('Spatial')
  original_points <- rgeos::gCentroid(spf, byid = TRUE)
  shape@data$CENTROIX <- original_points$x
  shape@data$CENTROIY <- original_points$y
  shape@data$key_orig <- paste(original_points$x, original_points$y, 
                               sep = "_")
  
  new_polygons <- new_cells_hex
  new_points <- new_polygons[[1]]
  vector_length <- length(shape)
  new_polygons2 <- new_polygons[[2]]
  s_poly <- sp::SpatialPolygonsDataFrame(new_polygons2, as.data.frame(sp::coordinates(new_polygons2)))
  s_poly$key_new <- paste(s_poly@data$V1, s_poly@data$V2, 
                          sep = "_")
  closest_site_vec <- vector(mode = "numeric", length = vector_length)
  min_dist_vec <- vector(mode = "numeric", length = vector_length)
  taken_vec <- vector(mode = "numeric", length = vector_length)
  taken_vec_index <- integer(vector_length)

## within loop ##
  i <- 1 # just first iteration
  dist_vec <- sp::spDistsN1(original_points, new_points[i], 
                            longlat = FALSE)
  min_dist_vec[i] <- min(dist_vec)
    closest_site_vec[i] <- which.min(dist_vec)

  taken_vec[i] <- which.min(dist_vec)
  taken_vec_index <- taken_vec[taken_vec > 0]
  costmatrix <- sp::spDists(original_points, new_points, 
                            longlat = FALSE)
  colnames(costmatrix) <- paste(s_poly@data$V1, s_poly@data$V2, 
                                sep = "_")
  rownames(costmatrix) <- paste(original_points@coords[, 1], original_points@coords[, 2], sep = "_")
  #################################################
  ### everything up to here evaluates instantly ###
  #################################################

  system.time(hungarian_costmin <- hungarian_cc(costmatrix)) # Just one element takes 10.8 s, MBP 2.2 GHz i7 16 GB RAM
  # user  system elapsed 
  # 10.796   0.033  10.876 
  
  # end loop
  # 11 * 436 obs ~ 4796 seconds / 80 minutes

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions