datagouv / csv-detective

CSV inspection
45 stars 10 forks source link

Wrong type and header line detection #44

Closed abulte closed 1 year ago

abulte commented 1 year ago

http://data.caf.fr/dataset/f6411f07-10bf-4f13-b4fb-8d30ba9328b5/resource/94a182c4-19c8-4d3a-987c-187a49756365/download/txcouvglo2014.csv

[:~] $ head /Users/alexandre/Downloads/txcouvglo2014.csv
NUMCOM;NOMCOM;NUMDEP;NOMDEP;NUMEPCI;NOMEPCI;TXCOUVGLO_COM_2014;TXCOUVGLO_DEP_2014;TXCOUVGLO_EPCI_2014
NUMCOM;NOMCOM;NUMDEP;NOMDEP;NUMEPCI;NOMEPCI;TXCOUVGLO_COM_2014;TXCOUVGLO_DEP_2014;TXCOUVGLO_EPCI_2014
01001;L'ABERGEMENT-CLEMENCIAT;01;AIN;200035210;CC CHALARONNE CENTRE;41.7;65.2;72.9
01002;L'ABERGEMENT-DE-VAREY;01;AIN;240100883;CC DE LA PLAINE DE L'AIN;34.1;65.2;75.2
01004;AMBERIEU-EN-BUGEY;01;AIN;240100883;CC DE LA PLAINE DE L'AIN;61.8;65.2;75.2
01005;AMBERIEUX-EN-DOMBES;01;AIN;200042497;CC DOMBES SAONE VALLEE;73.6;65.2;77.8
01006;AMBLEON;01;AIN;200040350;CC BUGEY SUD;93.1;65.2;52.4
01007;AMBRONAY;01;AIN;240100883;CC DE LA PLAINE DE L'AIN;51.4;65.2;75.2
01008;AMBUTRIX;01;AIN;240100883;CC DE LA PLAINE DE L'AIN;92;65.2;75.2
01009;ANDERT-ET-CONDON;01;AIN;200040350;CC BUGEY SUD;34.2;65.2;52.4
[:~] $ tail /Users/alexandre/Downloads/txcouvglo2014.csv
97415;SAINT-PAUL;974;LA REUNION;249740101;CA TERRITOIRE DE LA COTE OUEST (TCO);33.2;25.8;29
97416;SAINT-PIERRE;974;LA REUNION;249740077;CA CIVIS (COMMUNAUTE INTERCOMMUNALE DES VILLES SOLIDAIRES);34.5;25.8;25.8
97417;SAINT-PHILIPPE;974;LA REUNION;249740085;CA DU SUD;16.9;25.8;18.4
97418;SAINTE-MARIE;974;LA REUNION;249740119;CA INTERCOMMUNALE DU NORD DE LA REUNION (CINOR);32;25.8;31.1
97419;SAINTE-ROSE;974;LA REUNION;249740093;CA INTERCOMMUNALE DE LA REUNION EST (CIREST);17.2;25.8;20
97420;SAINTE-SUZANNE;974;LA REUNION;249740119;CA INTERCOMMUNALE DU NORD DE LA REUNION (CINOR);28.1;25.8;31.1
97421;SALAZIE;974;LA REUNION;249740093;CA INTERCOMMUNALE DE LA REUNION EST (CIREST);17.7;25.8;20
97422;LE TAMPON;974;LA REUNION;249740085;CA DU SUD;20.3;25.8;18.4
97423;LES TROIS-BASSINS;974;LA REUNION;249740101;CA TERRITOIRE DE LA COTE OUEST (TCO);14.3;25.8;29
97424;CILAOS;974;LA REUNION;249740077;CA CIVIS (COMMUNAUTE INTERCOMMUNALE DES VILLES SOLIDAIRES);9.1;25.8;25.8
[:~] $ grep "2A" /Users/alexandre/Downloads/txcouvglo2014.csv
2A001;AFA;2A;CORSE DU SUD;242010056;CA DU PAYS AJACCIEN;32.6;35.8;27.6
2A004;AJACCIO;2A;CORSE DU SUD;242010056;CA DU PAYS AJACCIEN;29.5;35.8;27.6
2A006;ALATA;2A;CORSE DU SUD;242010056;CA DU PAYS AJACCIEN;20.8;35.8;27.6
{
   "header":[
      "NUMCOM",
      "NOMCOM",
      "NUMDEP",
      "NOMDEP",
      "NUMEPCI",
      "NOMEPCI",
      "TXCOUVGLO_COM_2014",
      "TXCOUVGLO_DEP_2014",
      "TXCOUVGLO_EPCI_2014"
   ],
   "columns":{
      "NOMCOM":{
         "score":1.0,
         "format":"commune",
         "python_type":"string"
      },
      "NOMDEP":{
         "score":1.0,
         "format":"departement",
         "python_type":"string"
      },
      "NUMCOM":{
         "score":1.0,
         "format":"int",
         "python_type":"int"
      },
      "NUMDEP":{
         "score":1.0,
         "format":"int",
         "python_type":"int"
      },
      "NOMEPCI":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      },
      "NUMEPCI":{
         "score":1.0,
         "format":"siren",
         "python_type":"string"
      },
      "TXCOUVGLO_COM_2014":{
         "score":1.0,
         "format":"float",
         "python_type":"float"
      },
      "TXCOUVGLO_DEP_2014":{
         "score":1.0,
         "format":"float",
         "python_type":"float"
      },
      "TXCOUVGLO_EPCI_2014":{
         "score":1.0,
         "format":"float",
         "python_type":"float"
      }
   },
   "formats":{
      "int":[
         "NUMCOM",
         "NUMDEP"
      ],
      "float":[
         "TXCOUVGLO_COM_2014",
         "TXCOUVGLO_DEP_2014",
         "TXCOUVGLO_EPCI_2014"
      ],
      "siren":[
         "NUMEPCI"
      ],
      "string":[
         "NOMEPCI"
      ],
      "commune":[
         "NOMCOM"
      ],
      "departement":[
         "NOMDEP"
      ]
   },
   "encoding":"ISO-8859-1",
   "separator":";",
   "continuous":[
      "TXCOUVGLO_DEP_2014",
      "TXCOUVGLO_EPCI_2014"
   ],
   "categorical":[

   ],
   "total_lines":36636,
   "columns_fields":{
      "NOMCOM":{
         "score":1.0,
         "format":"commune",
         "python_type":"string"
      },
      "NOMDEP":{
         "score":1.0,
         "format":"departement",
         "python_type":"string"
      },
      "NUMCOM":{
         "score":1.0,
         "format":"code_commune_insee",
         "python_type":"string"
      },
      "NUMDEP":{
         "score":1.0,
         "format":"code_departement",
         "python_type":"string"
      },
      "NOMEPCI":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      },
      "NUMEPCI":{
         "score":1.0,
         "format":"siren",
         "python_type":"string"
      },
      "TXCOUVGLO_COM_2014":{
         "score":1.0,
         "format":"float",
         "python_type":"float"
      },
      "TXCOUVGLO_DEP_2014":{
         "score":0.9183673469387755,
         "format":"latitude_wgs",
         "python_type":"float"
      },
      "TXCOUVGLO_EPCI_2014":{
         "score":0.9387755102040817,
         "format":"longitude_wgs",
         "python_type":"float"
      }
   },
   "columns_labels":{
      "NOMCOM":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      },
      "NOMDEP":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      },
      "NUMCOM":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      },
      "NUMDEP":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      },
      "NOMEPCI":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      },
      "NUMEPCI":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      },
      "TXCOUVGLO_COM_2014":{
         "score":0.5,
         "format":"code_commune_insee",
         "python_type":"string"
      },
      "TXCOUVGLO_DEP_2014":{
         "score":0.5,
         "format":"code_departement",
         "python_type":"string"
      },
      "TXCOUVGLO_EPCI_2014":{
         "score":1.0,
         "format":"string",
         "python_type":"string"
      }
   },
   "header_row_idx":0,
   "heading_columns":0,
   "trailing_columns":0
}
Pierlou commented 1 year ago
abulte commented 1 year ago

Tested with #48, works perfectly 👏

[:~/Developer/Etalab/udata-hydra] add-csv-detective-types(+8/-6)+ ± poetry run udata-hydra analyse-csv --url http://data.caf.fr/dataset/f6411f07-10bf-4f13-b4fb-8d30ba9328b5/resource/94a182c4-19c8-4d3a-987c-187a49756365/download/txcouvglo2014.csv
2023-02-07 10:40:30 dev.local asyncio[16702] DEBUG Using selector: KqueueSelector
2023-02-07 10:40:34 dev.local udata-hydra[16702] DEBUG Converting from CSV to db for 3e1b3d2ce6e2f7f16c3f7d2183a08eef
[:~/Developer/Etalab/udata-hydra] add-csv-detective-types(+8/-6)+ ± docker compose exec -it -u postgres database-csv psql
psql (11.2 (Debian 11.2-1.pgdg90+1))
Type "help" for help.

postgres=# \d 3e1b3d2ce6e2f7f16c3f7d2183a08eef
                                          Table "public.3e1b3d2ce6e2f7f16c3f7d2183a08eef
"
       Column        |       Type        | Collation | Nullable |
      Default
---------------------+-------------------+-----------+----------+-----------------------
-------------------------------------------
 __id                | integer           |           | not null | nextval('"3e1b3d2ce6e2
f7f16c3f7d2183a08eef___id_seq"'::regclass)
 NUMCOM              | character varying |           |          |
 NOMCOM              | character varying |           |          |
 NUMDEP              | character varying |           |          |
 NOMDEP              | character varying |           |          |
 NUMEPCI             | character varying |           |          |
 NOMEPCI             | character varying |           |          |
 TXCOUVGLO_COM_2014  | double precision  |           |          |
 TXCOUVGLO_DEP_2014  | double precision  |           |          |
 TXCOUVGLO_EPCI_2014 | double precision  |           |          |
Indexes:
    "3e1b3d2ce6e2f7f16c3f7d2183a08eef_pkey" PRIMARY KEY, btree (__id)

postgres=# select * FROM "3e1b3d2ce6e2f7f16c3f7d2183a08eef" limit 1;
 __id | NUMCOM |         NOMCOM          | NUMDEP | NOMDEP |  NUMEPCI  |       NOMEPCI
      | TXCOUVGLO_COM_2014 | TXCOUVGLO_DEP_2014 | TXCOUVGLO_EPCI_2014
------+--------+-------------------------+--------+--------+-----------+----------------
------+--------------------+--------------------+---------------------
    1 | 01001  | L'ABERGEMENT-CLEMENCIAT | 01     | AIN    | 200035210 | CC CHALARONNE C
ENTRE |               41.7 |               65.2 |                72.9
(1 row)