PHACDataHub / pelias-canada

https://geocoder.alpha.phac.gc.ca/
0 stars 1 forks source link

create Dev script - R for reverse geocoding #140

Closed NickZachary closed 1 week ago

NickZachary commented 1 month ago
### Tasks
- [ ] https://github.com/PHACDataHub/pelias-canada/issues/141
NickZachary commented 1 month ago
library(httr)
library(jsonlite)
library(dplyr)
library(readr)

# Function to fetch reverse geocoding data with validation and maximum distance
reverse_geocode <- function(lat, lon, max_distance_km = 2.5) {
  if (!(lat >= -90 && lat <= 90) || !(lon >= -180 && lon <= 180)) {
    print(paste("Invalid latitude (", lat, ") or longitude (", lon, "). Skipping.", sep = ""))
    return(NULL)
  }

  base_url <- "https://geocoder.alpha.phac.gc.ca/api/v1/reverse"
  params <- list(
    `point.lat` = lat,
    `point.lon` = lon,
    `maxDistance` = max_distance_km * 1000  # Convert km to meters
  )

  # Construct URL with query parameters
  url <- paste0(base_url, "?", paste(names(params), unlist(params), sep = "=", collapse = "&"))

  print(paste("Request URL: ", url))  # Debugging line

  tryCatch({
    response <- GET(url)
    stop_for_status(response)
    return(fromJSON(content(response, "text", encoding = "UTF-8"), flatten = TRUE))
  }, error = function(e) {
    print(paste("Failed to fetch geocoding data: ", e$message, sep = ""))
    return(NULL)
  })
}

# Function to prioritize and rank features
prioritize_features <- function(features_df) {
  if (nrow(features_df) == 0) {
    return(NULL)
  }

  # Ensure coordinates are handled correctly
  features_df <- features_df %>%
    mutate(
      lat = sapply(geometry.coordinates, function(coord) if (length(coord) == 2) coord[2] else NA),
      lon = sapply(geometry.coordinates, function(coord) if (length(coord) == 2) coord[1] else NA)
    )

  sorted_features <- features_df %>%
    arrange(desc(properties.confidence), properties.distance)

  top_feature <- sorted_features[1,]

  if (is.null(top_feature$lat) || is.null(top_feature$lon)) {
    print("Error: Coordinates are missing in the top feature.")
    return(NULL)
  }

  result <- list(
    ddLat = as.character(top_feature$lat),
    ddLong = as.character(top_feature$lon),
    name = top_feature$properties.name,
    housenumber = top_feature$properties.housenumber,
    confidence = top_feature$properties.confidence,
    distance = top_feature$properties.distance,
    accuracy = top_feature$properties.accuracy,
    country = top_feature$properties.country,
    region = top_feature$properties.region,
    region_a = top_feature$properties.region_a,
    county = top_feature$properties.county,
    county_gid = top_feature$properties.county_gid,
    locality = top_feature$properties.locality,
    neighbourhood = top_feature$properties.neighbourhood,
    label = top_feature$properties.label
  )

  return(result)
}

# Function to calculate total confidence levels
calculate_total_confidence <- function(features_df) {
  confidence_data <- c('0.0-0.5' = 0, '0.5-0.8' = 0, '0.8-0.95' = 0, '1.0' = 0)

  features_df <- features_df %>%
    mutate(confidence = as.numeric(properties.confidence))  # Ensure confidence is numeric

  for (i in 1:nrow(features_df)) {
    confidence <- features_df$confidence[i]
    if (confidence >= 1.0) {
      confidence_data['1.0'] <- confidence_data['1.0'] + 1
    } else if (confidence >= 0.8) {
      confidence_data['0.8-0.95'] <- confidence_data['0.8-0.95'] + 1
    } else if (confidence >= 0.5) {
      confidence_data['0.5-0.8'] <- confidence_data['0.5-0.8'] + 1
    } else if (confidence >= 0.0) {
      confidence_data['0.0-0.5'] <- confidence_data['0.0-0.5'] + 1
    }
  }

  return(confidence_data)
}

calculate_match_rate <- function(confirmed_count, submitted_count) {
  if (submitted_count > 0) {
    return(confirmed_count / submitted_count * 100)
  } else {
    return(0.0)
  }
}

classify_accuracy <- function(confidence) {
  if (confidence >= 1.0) {
    return('100%')
  } else if (confidence >= 0.8) {
    return('80%-99.999%')
  } else if (confidence >= 0.5) {
    return('50%-79.999%')
  } else if (confidence >= 0.1) {
    return('0.1%-49.999%')
  } else {
    return('No Match')
  }
}

# Function to process CSV and save results
process_csv <- function(input_file, output_file) {
  df <- read_csv(input_file, show_col_types = FALSE)
  results <- list()
  total_confidence <- c('0.0-0.5' = 0, '0.5-0.8' = 0, '0.8-0.95' = 0, '1.0' = 0)
  confirmed_count <- 0
  submitted_count <- nrow(df)
  match_accuracy <- c('100%' = 0, '80%-99.999%' = 0, '50%-79.999%' = 0, '0.1%-49.999%' = 0, 'No Match' = 0)

  for (i in 1:nrow(df)) {
    row <- df[i,]
    if (!is.na(row$ddLat) && !is.na(row$ddLong)) {
      lat <- as.numeric(row$ddLat)
      lon <- as.numeric(row$ddLong)

      if (!(lat >= -90 && lat <= 90) || !(lon >= -180 && lon <= 180)) {
        print(paste("Invalid latitude (", lat, ") or longitude (", lon, ") in row ", i, ". Skipping.", sep = ""))
        next
      }

      geocoding_data <- reverse_geocode(lat, lon, max_distance_km)

      if (!is.null(geocoding_data)) {
        features_df <- geocoding_data$features
        if (is.data.frame(features_df)) {
          print(str(features_df))  # Debugging line to check the structure
          result <- prioritize_features(features_df)

          if (!is.null(result)) {
            result$RowNumber <- as.character(i)
            results <- append(results, list(result))

            # Update total confidence levels and confirmed count
            confidence_data <- calculate_total_confidence(features_df)
            total_confidence <- total_confidence + confidence_data

            if (as.numeric(result$confidence) >= 1.0) {
              confirmed_count <- confirmed_count + 1
            }

            accuracy_label <- classify_accuracy(as.numeric(result$confidence))
            match_accuracy[accuracy_label] <- match_accuracy[accuracy_label] + 1
          }
        } else {
          print("Unexpected format for features data")
        }
      }
    }
  }

  match_rate <- calculate_match_rate(confirmed_count, submitted_count)

  output_df <- bind_rows(results)
  write_csv(output_df, output_file)

  epoch_time <- as.integer(Sys.time())
  metadata_file <- paste0(gsub(".csv", "", output_file), "_meta.csv")

  metadata_df <- data.frame(
    Metadata_Information = c(
      paste('Epoch Time', epoch_time),
      paste('Match Rate', paste0(confirmed_count, '/', submitted_count, ' (', match_rate, '%)')),
      paste('Total Requests', submitted_count),
      paste('100% Match', match_accuracy['100%']),
      paste('80%-99.999% Match', match_accuracy['80%-99.999%']),
      paste('50%-79.999% Match', match_accuracy['50%-79.999%']),
      paste('0.1%-49.999% Match', match_accuracy['0.1%-49.999%']),
      paste('No Match', match_accuracy['No Match']),
      paste('Coordinate System', 'WGS 1984'),
      paste('Maximum Search Distance (km)', max_distance_km)
    ),
    stringsAsFactors = FALSE
  )

  # Ensure directory exists
  output_dir <- dirname(output_file)
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }

  write_csv(metadata_df, metadata_file)

  output_csv <- paste0(gsub(".csv", "", output_file), '_', epoch_time, '.csv')
  write_csv(output_df, output_csv)

  print(paste("Output CSV file saved as ", output_csv, sep = ""))
  print(paste("Metadata file saved as ", metadata_file, sep = ""))
}

# Prompt for user input in Posit Cloud
print("Upload the input CSV file:")
input_file <- file.choose()

print("Please enter the name of the output CSV file (without extension):")
output_file_name <- readline()

output_dir <- '/content'
max_distance_km <- 2.5

process_csv(input_file, paste0(output_file_name, ".csv"))