dat-ecosystem-archive / svalbard

A global metadata vault [ DEPRECATED - More info on active projects and modules at https://dat-ecosystem.org/ ]
62 stars 6 forks source link

CKAN metadata exporter #9

Open max-mapper opened 7 years ago

max-mapper commented 7 years ago

We need a CLI that regularly monitors the metadata of a CKAN instance and produces complete exports of that metadata in NDJSON format.

For Data.gov I used the script below. Note that its query does not return every item: Data.gov has sub-organizations whose datasets this query ignores. I'm not sure whether sub-organizations are a general CKAN concept or specific to Data.gov.

var request = require('request')
var fs = require('fs')
var ndjson = require('ndjson')

// Paging state and configuration shared by page() and go().
var rows = 1000 // records requested per page (the `rows` query parameter)
var delay = 1000 // pause between consecutive requests, passed to setTimeout (ms)
var limit = 200000 // paging stops once `current` has grown past this value
var current = 0 // offset of the next page to request (the `start` query parameter)

// Objects written to `serialize` are piped straight into ./meta.json
// (presumably one JSON document per line, per the ndjson package name).
var write = fs.createWriteStream('./meta.json')
var serialize = ndjson.serialize()

serialize.pipe(write)

/**
 * Build the package_search URL for the page of results beginning at `start`.
 * The page size comes from the module-level `rows` value.
 *
 * @param {number} start - value sent as the `start` query parameter
 * @returns {string} the request URL
 */
function page (start) {
  var endpoint = 'http://catalog.data.gov/api/3/action/package_search'
  var query = `rows=${rows}&start=${start}`
  return `${endpoint}?${query}`
}

/**
 * Fetch the page of search results at offset `current`, write every record
 * to the `serialize` stream, then schedule the next page after `delay` ms.
 *
 * Paging stops, and the stream is ended, when a page comes back empty (there
 * is nothing left to export) or once `current` has grown past `limit`.
 *
 * @throws {Error} on a transport error, a non-200 response, or a response
 *   body that does not contain a `result.results` array. These are thrown
 *   from the request callback, so they surface as uncaught exceptions.
 */
function go () {
  var url = page(current)
  console.log('GET', url)
  request({url: url, json: true}, function (err, resp, body) {
    if (err) throw err
    if (resp.statusCode !== 200) {
      // Error() stringifies its argument, so passing the parsed body directly
      // would yield the useless message "[object Object]".
      throw new Error(`GET ${url} returned HTTP ${resp.statusCode}: ${JSON.stringify(body)}`)
    }
    // Guard the shape before touching it: the body may not be the expected
    // object (e.g. if the response was not valid JSON).
    var results = body && body.result && body.result.results
    if (!Array.isArray(results)) {
      throw new Error(`Unexpected response from ${url}: ${JSON.stringify(body)}`)
    }
    current += rows
    results.forEach(function (r) {
      serialize.write(r)
    })
    // An empty page means the results ran out before `limit` was reached;
    // that is a normal end of the export, so close the stream instead of failing.
    if (results.length === 0 || current > limit) {
      console.log('done')
      serialize.end()
    } else {
      setTimeout(go, delay)
    }
  })
}

// Start the export: request the first page; each successful page schedules the next.
go()