Closed garethrees closed 8 years ago
It would be good to wikify the scripts somehow.
I didn't use an ignore file or do any manual cleanup (other than what you see in the script below):
#!/usr/bin/env ruby
# -*- encoding : utf-8 -*-
require 'csv'
# gem install unicode_utils
require 'unicode_utils/downcase'
require 'unicode_utils/titlecase'
# Cleans the CSV file at +csv+ and prints the regenerated CSV to stdout.
#
# csv    - path of the source CSV file.
# ignore - optional path of a file holding one entry per line; its lines
#          are handed to clean_csv as the ignore list. nil means nothing
#          is ignored.
def main(csv, ignore)
  ignore_list = []
  ignore_list = File.read(ignore).split("\n") if ignore

  cleaned_rows = clean_csv(csv, ignore_list)
  puts print_new_csv(cleaned_rows)
end
# Reads the CSV file at +csv+ and returns an array of cleaned row hashes.
#
# csv         - path of the source file. Headers are converted to symbols,
#               so the columns read here are :nombre_institucion, :sigla,
#               :email, :url, :direccion and :sector.
# ignore_list - institution names, exactly as they appear in the source
#               file, whose rows are skipped.
#
# Each remaining row is cleaned field by field and passed through
# manual_fixups; rows for which manual_fixups returns nil are dropped.
def clean_csv(csv, ignore_list = [])
  rows = []

  CSV.foreach(csv, headers: true, header_converters: :symbol) do |row|
    # The institution name lives in :nombre_institucion (the column fed to
    # clean_name below). The previous check only looked at :name, which the
    # rest of this method never reads, so the ignore list could not match.
    # The :name lookup is kept for inputs that do carry such a column.
    next if ignore_list.include?(row[:nombre_institucion]) ||
            ignore_list.include?(row[:name])

    cleaned = {
      name: clean_name(row[:nombre_institucion].to_s),
      short_name: clean_short_name(row[:sigla].to_s),
      request_email: clean_email(row[:email].to_s),
      home_page: clean_homepage(row[:url].to_s),
      notes: make_notes(row[:direccion].to_s),
      tag_string: make_tag_string(row[:sector].to_s)
    }

    fixed = manual_fixups(cleaned)
    rows << fixed unless fixed.nil?
  end

  rows
end
# Returns +str+ converted to title case by UnicodeUtils.titlecase.
#
# str - the institution name as a String.
def clean_name(str)
  UnicodeUtils.titlecase(str)
end
# Short names are deliberately left blank: every input yields nil.
#
# An earlier version returned nil for an empty string and otherwise the
# value lower-cased with UnicodeUtils.downcase; that was disabled because
# the data contained too many duplicates to handle.
#
# str - the abbreviation from the source row (currently unused).
def clean_short_name(str)
  nil
end
# Extracts a single email address from a free-form field.
#
# The first whitespace-separated token is taken, commas are removed, only
# the part before the first ';' is kept, and one trailing non-word
# character (for example a full stop) is dropped.
#
# str - the raw field as a String; it is not modified.
#
# Returns the cleaned address, or nil when nothing usable is left (empty
# or whitespace-only input, or input made up only of separators).
# Raises RuntimeError when the result still ends in a non-word character.
def clean_email(str)
  # `first` is nil when the field holds only whitespace, and again when the
  # token holds only separators; to_s turns both into "" instead of
  # raising NoMethodError.
  token = str.split(' ').first.to_s.strip.delete(',')
  candidate = token.split(';').first.to_s.strip

  # The pattern is anchored at the end, so at most one character goes.
  candidate = candidate.sub(/\W\z/, '')
  return nil if candidate.empty?

  raise "Email included non-word at end: #{candidate}" if candidate =~ /\W\z/

  candidate
end
# Normalizes a home page URL.
#
# str - the raw field as a String.
#
# Returns nil for empty or whitespace-only input. Otherwise returns the
# value with surrounding whitespace removed, prefixed with "http://"
# unless it already begins with "http" in any letter case.
def clean_homepage(str)
  # Strip before looking at the prefix: otherwise " http://x" is not
  # recognized and ends up as "http:// http://x", and a blank field
  # becomes the bare string "http://".
  url = str.strip
  return nil if url.empty?

  url =~ /\Ahttp/i ? url : "http://#{url}"
end
# Builds the HTML notes snippet for a record from its street address.
#
# address - the raw address field as a String.
#
# Returns nil when the address is empty or whitespace-only (previously a
# whitespace-only address produced a label with nothing after it),
# otherwise the stripped address behind a bold "Direccion:" label.
def make_notes(address)
  stripped = address.strip
  return nil if stripped.empty?

  %(<strong>Direccion:</strong> #{stripped})
end
# Converts a comma-separated sector list into a space-separated tag string.
#
# str - the raw sector field as a String.
#
# Each entry is trimmed, spaces inside it become underscores, and it is
# lower-cased with UnicodeUtils.downcase. Entries are split on commas with
# or without surrounding whitespace, and empty entries are dropped.
#
# Returns nil when the field is empty, whitespace-only, equal to
# "no aplica" (ignoring case and surrounding whitespace), or contains no
# non-empty entry.
def make_tag_string(str)
  sectors = str.strip
  return nil if sectors.empty? || sectors.downcase == 'no aplica'

  # Splitting on the literal ", " left "a,b" as one tag and turned stray
  # whitespace into leading/trailing underscores; trim each entry instead.
  tags = sectors.split(',').map(&:strip).reject(&:empty?)
  return nil if tags.empty?

  tags.map { |tag| UnicodeUtils.downcase(tag.tr(' ', '_')) }.join(' ')
end
# Applies hand-written corrections for individual records.
#
# data - hash of cleaned fields; the keys consulted are :name, :home_page,
#        :request_email, :notes and :tag_string. It is modified in place.
#
# Returns nil when the record is a known duplicate that should be dropped,
# otherwise the same hash, possibly with a corrected name, a blanked email
# or an extra tag. At most one rule is applied per record, and the
# name-based rules take precedence over the email-based ones.
def manual_fixups(data)
  name = data[:name]
  email = data[:request_email]

  # Bodies sharing a name; the home page decides which suffix is added.
  renames_by_home_page = {
    ['Personeria Municipal Jericó', 'http://www.jerico-antioquia.gov.co/'] =>
      'Personeria Municipal Jericó - Antioquia',
    ['Personeria Municipal Jericó', 'http://www.jerico-boyaca.gov.co'] =>
      'Personeria Municipal Jericó - Boyacá',
    ['Personeria Municipal San Francisco', 'http://sanfrancisco-antioquia.gov.co/Personeria.shtml'] =>
      'Personeria Municipal San Francisco - Antioquia',
    ['Personeria Municipal San Francisco', 'http://www.sanfrancisco-putumayo.gov.co'] =>
      'Personeria Municipal San Francisco - Putumayo'
  }

  # Renamed whatever the other fields hold ("not sure, so just add tag suffix").
  renames_by_name = {
    'E.s.e Hospital Nuestra Señora Del Carmen' => 'E.s.e Hospital Nuestra Señora Del Carmen tabio'
  }

  # Duplicates identified by name alone.
  duplicate_names = [
    'E.s.p Empresa De Servicios Publicos La Union',
    'Personeria Municipal De Belén Boyacá',
    'Personería Municipal El Zulia'
  ]

  # Duplicates identified by name plus request email (nil = no email).
  duplicate_name_email_pairs = [
    ['Secretaria Distrital De Salud', 'contabilidad@shd.gov.co'],
    ['E.s.e. Hospital El Carmen -Amalfi', 'paramillo209@gmail.com'],
    ['E.s.e Hospital Local Del Bolivar -Santander', nil],
    ['E.s.e Hospital Universitario Del Caribe', nil],
    ['Personeria Municipal Soplaviento', 'miladisbc25@yahoo.es'],
    ['E.s.e Centro De Salud San Miguel', '0@0'], # also an invalid email
    ['E.s.e Hospital Nuevo Horizonte', 'luisfernandovargas2005@hotmail.com'], # looks like a duplicate
    ['Personería Municipal Murillo', nil]
  ]

  # Duplicates identified by name plus notes.
  duplicate_name_notes_pairs = [
    ['E.s.e. Hospital San Antonio - Natagaima', %q(<strong>Direccion:</strong> Calle 5 con Carrera 11)],
    ['E.s.e. Centro De Salud San Sebastián', '<strong>Direccion:</strong> calle 2 # 4 - 64']
  ]

  # Values that are not email addresses at all; they are simply blanked.
  invalid_emails = [
    'alcaldia@sandona_narino.gov.co',
    'elbanco-magadalena.gov.co',
    'Carrera',
    'janella-hotmail.com',
    'http://indeportesguajira.gov.co/contactenos',
    'personeriamomil',
    'www.electrohuila.com.co/Login.aspx?ReturnUrl=%2fP_Q_R.aspx',
    'http://www.urra.com.co/Contacteno.php',
    'www.conif.org.co/contactenos.php'
  ]

  # Addresses containing non-ASCII characters (see https://git.io/vrKTJ);
  # they are blanked and the record is tagged so it can be found later.
  unicode_emails = [
    'alcaldía@nocaima-cundinamarca.gov.co',
    'haroldcaleño2993@hotmail.com',
    'fmuñoz@esepaf.gov.co',
    'alcaldía@gama-cundinamarca.gov.co',
    'corporación@parquearvi.org'
  ]

  new_name = renames_by_home_page[[name, data[:home_page]]] || renames_by_name[name]
  if new_name
    data[:name] = new_name
    return data
  end

  return nil if duplicate_names.include?(name)
  return nil if duplicate_name_email_pairs.include?([name, email])
  return nil if duplicate_name_notes_pairs.include?([name, data[:notes]])

  if unicode_emails.include?(email)
    data[:request_email] = nil
    data[:tag_string] = data[:tag_string].to_s.split(' ').push('unicode_email').join(' ')
  elsif invalid_emails.include?(email)
    data[:request_email] = nil
  end

  data
end
# Serializes the cleaned rows as CSV text.
#
# data - array of hashes that all share the same keys in the same order;
#        the keys of the first hash become the header row.
#
# The first header is prefixed with "#" (presumably the header marker the
# downstream importer expects; confirm against the import format).
#
# Returns the CSV as a String; an empty data set yields an empty string
# instead of raising NoMethodError on nil.
def print_new_csv(data)
  return '' if data.empty?

  headers = data.first.keys
  headers[0] = "##{headers[0]}"

  CSV.generate(headers: headers) do |csv|
    csv << headers
    data.each { |row| csv << row.values }
  end
end
# Command-line entry point: <script> SOURCE_CSV [IGNORE_FILE]
csv = ARGV[0]
ignore = ARGV[1]

# The cleaned CSV is written to stdout, so diagnostics go to stderr:
# abort prints its message there and exits with status 1.
abort "File does not exist: #{csv}" if csv.nil? || !File.exist?(csv)
abort "File does not exist: #{ignore}" if ignore && !File.exist?(ignore)

main(csv, ignore)
A handful couldn't be imported because of https://github.com/mysociety/alaveteli/issues/2957.
Note: https://git.io/vrKTJ (referenced in the script above) redirects to https://github.com/mysociety/alaveteli/issues/2684
Source will need cleanup similar to https://github.com/mysociety/bilmehakki-theme/issues/11