Closed WolfgangFahl closed 4 years ago
http://wiki.bitplan.com/images/confident/City_wikidata.db now has a suitable database derived from a wikidata query:
# get a list of cities
# for use in the geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
# get City details with Country
SELECT DISTINCT ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGDP_perCapita ?city ?cityLabel ?coord ?cityPopulation ?date ?ratio WHERE {
# run for Paris as example only
# if you uncomment this line this query might run for some 3 hours on a local wikidata copy using Apache Jena
VALUES ?city {wd:Q90}.
# instance of City Q515
# instance of human settlement https://www.wikidata.org/wiki/Q486972
?city wdt:P31/wdt:P279* wd:Q486972 .
# label of the City
?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
# get the coordinates
?city wdt:P625 ?coord.
# country this city belongs to
?city wdt:P17 ?country .
# label for the country
?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
# https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
?country wdt:P297 ?countryIsoCode.
# population of country
?country wdt:P1082 ?countryPopulation.
# https://www.wikidata.org/wiki/Property:P2132
# nonminal GDP per capita
?country wdt:P2132 ?countryGDP_perCapita.
# population of city
?city p:P1082 ?populationStatement .
?populationStatement ps:P1082 ?cityPopulation.
?populationStatement pq:P585 ?date
FILTER NOT EXISTS { ?city p:P1082/pq:P585 ?date_ . FILTER (?date_ > ?date) }
BIND ( concat(str(round(10000*?cityPopulation/?countryPopulation)/100), '%') AS ?ratio)
}
try it! for example city Paris
pragma table_info('city_wikidata')
cid name type
0 country TEXT
1 countryLabel TEXT
2 countryIsoCode TEXT
3 countryPopulation FLOAT
4 countryGDP_perCapita FLOAT
5 coord TEXT
6 cityPopulation FLOAT
7 date TIMESTAMP
8 ratio TEXT
9 wikidataurl TEXT
10 name TEXT
select count(*),countryIsoCode
from City_wikidata
group by countryIsoCode
order by 1 desc
limit 10
count(*) countryIsoCode
38248 FR
29450 US
17105 AZ
15603 LT
14916 BY
14134 CZ
13930 RO
13818 DE
12544 AU
10067 RU
select count(*) as qty,name
from city_wikidata
group by name
having count(*)>1
order by 1 desc
qty name
111 Artyom
110 Gilgilçay
110 Məmmədxanlı
110 Qaraçala
110 Qiyaməddinli
110 Təklə
110 Xanəgah
110 Xaçmaz
62 Novonikolayevka
58 Liman
with 26941 possible duplicate human settlement names that's about 10% of all settlement names in the dataset
Disambiguation example for Paris (which shows us that we also need region information ...!)
select wikidataurl,cityPopulation,countryLabel,countryIsoCode,coord
from city_wikidata
where name='Paris'
order by cityPopulation desc
http://www.wikidata.org/entity/Q90 2187526.0 France FR Point(2.3513888888889 48.856944444444)
http://www.wikidata.org/entity/Q830149 25171.0 United States of America US Point(-95.547692 33.662508)
http://www.wikidata.org/entity/Q3305306 13448.0 Canada CA Point(-80.3833 43.2)
http://www.wikidata.org/entity/Q1018504 9763.0 United States of America US Point(-88.313888888889 36.301111111111)
http://www.wikidata.org/entity/Q576584 8837.0 United States of America US Point(-87.6961 39.6111)
http://www.wikidata.org/entity/Q934294 5183.0 United States of America US Point(-70.500555555556 44.259722222222)
http://www.wikidata.org/entity/Q538772 4411.0 United States of America US Point(-75.313888888889 43.000555555556)
http://www.wikidata.org/entity/Q79917 3532.0 United States of America US Point(-93.726111111111 35.291666666667)
http://www.wikidata.org/entity/Q2220917 1473.0 United States of America US Point(-88.013055555556 42.616944444444)
http://www.wikidata.org/entity/Q960025 1220.0 United States of America US Point(-92.0011 39.4803)
http://www.wikidata.org/entity/Q2219239 754.0 United States of America US Point(-90.610277777778 42.650277777778)
http://www.wikidata.org/entity/Q7137172 732.0 United States of America US Point(-80.5125 40.403333333333)
http://www.wikidata.org/entity/Q984459 576.0 United States of America US Point(-111.399 42.2278)
http://www.wikidata.org/entity/Q13107162 73.0 Sweden SE Point(18.799 59.7594)
Improved query:
# get a list of cities
# for geograpy3 library
# see https://github.com/somnathrakshit/geograpy3/issues/15
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
# get City details with Country
SELECT DISTINCT ?country ?countryLabel ?countryIsoCode ?countryPopulation ?countryGDP_perCapita ?region ?regionLabel ?regionIsoCode ?city ?cityLabel ?coord ?cityPopulation ?date ?ratio WHERE {
# run for Paris as example only
# if you uncomment this line this query might run for some 3 hours on a local wikidata copy using Apache Jena
VALUES ?city {wd:Q90}.
# instance of City Q515
# instance of human settlement https://www.wikidata.org/wiki/Q486972
?city wdt:P31/wdt:P279* wd:Q486972 .
# label of the City
?city rdfs:label ?cityLabel filter (lang(?cityLabel) = "en").
# get the coordinates
?city wdt:P625 ?coord.
# region this country belongs to
# https://www.wikidata.org/wiki/Property:P361
OPTIONAL {
# part of
# https://www.wikidata.org/wiki/Property:P361
?city wdt:P131 ?region.
# first order region
?region wdt:P31/wdt:P279* wd:Q10864048.
?region rdfs:label ?regionLabel filter (lang(?regionLabel) = "en").
?region wdt:P300 ?regionIsoCode
}
# country this city belongs to
?city wdt:P17 ?country .
# label for the country
?country rdfs:label ?countryLabel filter (lang(?countryLabel) = "en").
# https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
?country wdt:P297 ?countryIsoCode.
# population of country
?country wdt:P1082 ?countryPopulation.
# https://www.wikidata.org/wiki/Property:P2132
# nonminal GDP per capita
?country wdt:P2132 ?countryGDP_perCapita.
# population of city
?city p:P1082 ?populationStatement .
?populationStatement ps:P1082 ?cityPopulation.
?populationStatement pq:P585 ?date
FILTER NOT EXISTS { ?city p:P1082/pq:P585 ?date_ . FILTER (?date_ > ?date) }
BIND ( concat(str(round(10000*?cityPopulation/?countryPopulation)/100), '%') AS ?ratio)
}
The query runs 3.5 h on my 12 core 64 GB machine with the 4TB SSD disk holding a local wikidata copy in Apache Jena and retrieves some 280147 cities
When looking for non unique cities allow to disambiguate via population and or gdp per capita data.
Thus Vienna and Paris would find the major cities in Austria and France by default given the much higher population of these cities if no other disambiguation information is available.