mysociety / colombia-theme

The Alaveteli theme for QueremosDatos (Colombia)
http://www.queremosdatos.co/
MIT License
0 stars 1 forks source link

Import authorities #4

Closed garethrees closed 8 years ago

garethrees commented 8 years ago

Source will need cleanup similar to https://github.com/mysociety/bilmehakki-theme/issues/11

garethrees commented 8 years ago

It would be good to wikify the scripts somehow.

garethrees commented 8 years ago

Didn't use an ignore file or do any manual cleanup (other than what you see in the script below):

#!/usr/bin/env ruby
# -*- encoding : utf-8 -*-
require 'csv'
# gem install unicode_utils
require 'unicode_utils/downcase'
require 'unicode_utils/titlecase'

# Cleans the source CSV and prints the resulting import CSV to standard output.
#
# csv    - path to the source CSV file.
# ignore - optional path to a file of names to skip, one per line; nil means
#          nothing is ignored.
def main(csv, ignore)
  ignore_list = []
  ignore_list = File.read(ignore).split("\n") if ignore

  cleaned_rows = clean_csv(csv, ignore_list)
  puts print_new_csv(cleaned_rows)
end

# Reads the source CSV and turns each row into a hash of cleaned attributes.
#
# csv         - path to the source CSV; its first line is used as headers,
#               which are converted to symbols.
# ignore_list - Array of raw authority names whose rows should be skipped.
#
# Returns an Array of Hashes with the keys :name, :short_name,
# :request_email, :home_page, :notes and :tag_string. Rows for which
# manual_fixups returns nil are left out.
def clean_csv(csv, ignore_list = [])
  rows = []
  CSV.foreach(csv, headers: true, header_converters: :symbol) do |row|
    # The name is read from the `nombre_institucion` column below, so the
    # ignore list has to be matched against that column. `name` is still
    # checked so a source file using that header keeps working.
    raw_names = [row[:nombre_institucion], row[:name]]
    next if raw_names.any? { |raw_name| ignore_list.include?(raw_name) }

    cleaned_data = {}
    cleaned_data[:name] = clean_name(row[:nombre_institucion].to_s)
    cleaned_data[:short_name] = clean_short_name(row[:sigla].to_s)
    cleaned_data[:request_email] = clean_email(row[:email].to_s)
    cleaned_data[:home_page] = clean_homepage(row[:url].to_s)
    cleaned_data[:notes] = make_notes(row[:direccion].to_s)
    cleaned_data[:tag_string] = make_tag_string(row[:sector].to_s)

    # manual_fixups returns nil for rows that must be dropped.
    manually_cleaned_data = manual_fixups(cleaned_data)
    rows << manually_cleaned_data unless manually_cleaned_data.nil?
  end
  rows
end

# Normalises the casing of an authority name.
#
# str - the raw name String.
#
# Returns the result of UnicodeUtils.titlecase for str.
def clean_name(str)
  titlecased = UnicodeUtils.titlecase(str)
  titlecased
end

# Short names are deliberately discarded: the source data contains too many
# duplicates to handle.
#
# str - the raw short name String (currently unused).
#
# Always returns nil.
def clean_short_name(str)
  # Disabled implementation, kept for reference:
  #   str.empty? ? nil : UnicodeUtils.downcase(str)
  nil
end

# Extracts a single email address from a free-text cell.
#
# Only the first whitespace-separated token is considered; commas are removed
# from it, anything after the first ';' is discarded, and one trailing
# non-word character is dropped.
#
# str - the raw cell contents as a String.
#
# Returns the cleaned String, or nil when nothing is left (empty, blank or
# delimiter-only input).
# Raises RuntimeError if the cleaned value still ends in a non-word character.
def clean_email(str)
  # `split(' ')` skips leading whitespace and yields no tokens for blank
  # input, so `first` is nil in that case.
  first_token = str.split(' ').first
  return nil if first_token.nil?

  # `split(';')` yields no elements for input such as ";" or ",".
  candidate = first_token.delete(',').split(';').first
  return nil if candidate.nil?

  candidate = candidate.strip.sub(/\W\z/, '')
  return nil if candidate.empty?

  raise "Email included non-word at end: #{candidate}" if candidate =~ /\W\z/

  candidate
end

# Normalises a home page URL.
#
# Surrounding whitespace is removed before anything else, so padded values
# are not mistaken for scheme-less ones. Values that do not already start with
# an http:// or https:// scheme (compared case-insensitively) get "http://"
# prepended.
#
# str - the raw cell contents as a String.
#
# Returns the URL String, or nil for empty or blank input.
def clean_homepage(str)
  url = str.strip
  return nil if url.empty?
  return url if url =~ %r{\Ahttps?://}i

  "http://#{ url }"
end

# Builds the HTML notes snippet holding the authority's postal address.
#
# address - the raw address String.
#
# Returns the notes String, or nil when the address is empty or blank.
def make_notes(address)
  trimmed = address.strip
  return nil if trimmed.empty?

  %Q(<strong>Direccion:</strong> #{ trimmed })
end

# Converts a comma-separated sector list into a space-separated tag string.
#
# Each entry is trimmed, its internal whitespace runs are replaced with a
# single underscore, and it is lower-cased with UnicodeUtils.downcase. Empty
# entries are dropped.
#
# str - the raw sector cell as a String.
#
# Returns the tag String, or nil when the cell is empty, blank, contains no
# usable entries, or is the placeholder "no aplica" (any casing).
def make_tag_string(str)
  return nil if str.strip.downcase == 'no aplica'

  # Split on bare commas so "a,b" and "a, b" are treated alike.
  tags = str.split(',').map(&:strip).reject(&:empty?)
  return nil if tags.empty?

  tags.map { |tag| UnicodeUtils.downcase(tag.gsub(/\s+/, '_')) }.join(' ')
end

# Applies hand-written corrections for specific records in the source data.
#
# Name-based rules are checked first, in order; the first one whose
# conditions all match decides the outcome. Email-based corrections are only
# reached when no name-based rule matched.
#
# data - Hash of cleaned attributes (:name, :request_email, :home_page,
#        :notes, :tag_string). It is modified in place.
#
# Returns data (possibly modified), or nil when the record should be dropped.
def manual_fixups(data)
  # Each rule is [conditions, outcome]. Every condition field must equal the
  # given value. The outcome is either :drop or the replacement name.
  name_rules = [
    # Same-named authorities told apart by their home page
    [{ name: 'Personeria Municipal Jericó', home_page: 'http://www.jerico-antioquia.gov.co/' },
     'Personeria Municipal Jericó - Antioquia'],
    [{ name: 'Personeria Municipal Jericó', home_page: 'http://www.jerico-boyaca.gov.co' },
     'Personeria Municipal Jericó - Boyacá'],
    [{ name: 'Personeria Municipal San Francisco', home_page: 'http://sanfrancisco-antioquia.gov.co/Personeria.shtml' },
     'Personeria Municipal San Francisco - Antioquia'],
    [{ name: 'Personeria Municipal San Francisco', home_page: 'http://www.sanfrancisco-putumayo.gov.co' },
     'Personeria Municipal San Francisco - Putumayo'],

    # Duplicates
    [{ name: 'Secretaria Distrital De Salud', request_email: 'contabilidad@shd.gov.co' }, :drop],
    [{ name: 'E.s.e. Hospital  El Carmen  -Amalfi', request_email: 'paramillo209@gmail.com' }, :drop],
    [{ name: 'E.s.e Hospital Local Del Bolivar  -Santander', request_email: nil }, :drop],
    [{ name: 'E.s.e. Hospital  San Antonio  - Natagaima', notes: '<strong>Direccion:</strong> Calle 5 con Carrera 11' }, :drop],
    [{ name: 'E.s.e Hospital Universitario Del Caribe', request_email: nil }, :drop],
    [{ name: 'E.s.p Empresa De Servicios Publicos La Union' }, :drop],
    [{ name: 'Personeria Municipal De Belén Boyacá' }, :drop],
    [{ name: 'Personería Municipal El Zulia' }, :drop],
    [{ name: 'Personeria Municipal Soplaviento', request_email: 'miladisbc25@yahoo.es' }, :drop],
    # Duplicate; invalid email
    [{ name: 'E.s.e Centro De Salud San Miguel', request_email: '0@0' }, :drop],
    [{ name: 'E.s.e. Centro De Salud San Sebastián', notes: '<strong>Direccion:</strong> calle 2 # 4 - 64' }, :drop],

    # Not sure, so just add tag suffix
    [{ name: 'E.s.e Hospital Nuestra Señora Del Carmen' },
     'E.s.e Hospital Nuestra Señora Del Carmen tabio'],

    # Looks like a duplicate
    [{ name: 'E.s.e Hospital Nuevo Horizonte', request_email: 'luisfernandovargas2005@hotmail.com' }, :drop],
    # Duplicate
    [{ name: 'Personería Municipal Murillo', request_email: nil }, :drop]
  ]

  name_rules.each do |conditions, outcome|
    next unless conditions.all? { |field, expected| data[field] == expected }
    return nil if outcome == :drop

    data[:name] = outcome
    return data
  end

  # Values in the email column that are not email addresses at all.
  invalid_emails = [
    'alcaldia@sandona_narino.gov.co',
    'elbanco-magadalena.gov.co',
    'Carrera',
    'janella-hotmail.com',
    'http://indeportesguajira.gov.co/contactenos',
    'personeriamomil',
    'www.electrohuila.com.co/Login.aspx?ReturnUrl=%2fP_Q_R.aspx',
    'http://www.urra.com.co/Contacteno.php',
    'www.conif.org.co/contactenos.php'
  ]

  # Unicode email https://git.io/vrKTJ
  unicode_emails = [
    'alcaldía@nocaima-cundinamarca.gov.co',
    'haroldcaleño2993@hotmail.com',
    'fmuñoz@esepaf.gov.co',
    'alcaldía@gama-cundinamarca.gov.co',
    'corporación@parquearvi.org'
  ]

  email = data[:request_email]
  if invalid_emails.include?(email)
    data[:request_email] = nil
  elsif unicode_emails.include?(email)
    data[:request_email] = nil
    # Tag the record so the removed address can be traced later.
    data[:tag_string] = data[:tag_string].to_s.split(' ').push('unicode_email').join(' ')
  end

  data
end

# Serialises the cleaned rows as CSV text.
#
# The header row is taken from the keys of the first hash, with "#" prefixed
# to the first header. Each row's values are looked up by those keys, so the
# columns line up even if a later hash lists its keys in a different order.
#
# data - Array of Hashes, as returned by clean_csv.
#
# Returns the CSV as a String; an empty String when data is empty.
def print_new_csv(data)
  # Without rows there are no keys to derive a header from.
  return '' if data.empty?

  keys = data.first.keys
  header_row = keys.map(&:to_s)
  header_row[0] = "##{ header_row[0] }"

  CSV.generate(headers: header_row) do |csv|
    csv << header_row
    data.each { |row| csv << row.values_at(*keys) }
  end
end

# Command-line entry point: SOURCE_CSV [IGNORE_FILE]
#
# Diagnostics are written with `warn` because the generated CSV is printed
# on standard output; error text must not end up in redirected output.
csv = ARGV[0]
ignore = ARGV[1]

if csv.nil?
  warn "Usage: #{ $PROGRAM_NAME } SOURCE_CSV [IGNORE_FILE]"
  exit 1
end

unless File.exist?(csv)
  warn "File does not exist: #{ csv }"
  exit 1
end

if ignore && !File.exist?(ignore)
  warn "File does not exist: #{ ignore }"
  exit 1
end

main(csv, ignore)
garethrees commented 8 years ago

A handful couldn't be imported because of https://github.com/mysociety/alaveteli/issues/2957.

garethrees commented 2 years ago

https://git.io/vrKTJ above redirects to https://github.com/mysociety/alaveteli/issues/2684