CodeForPhilly / jawn

'Git for Tabular Data'
http://datjawn.com
BSD 3-Clause "New" or "Revised" License
44 stars 9 forks source link

Transform input streams into consistent JSON objects #32

Closed flyingzumwalt closed 8 years ago

flyingzumwalt commented 8 years ago

Implement (or require) a stream that parses its input buffers and transforms input CSV, JSON, etc to json objects that will be written as blocks in our hypercore feeds.

There's placeholder code for this stream in lib/import.js (see PR #31).

This is the functionality that parse-input-stream aims to support. If you can get that module to work, feel free to use it!

If you don't want to use parse-input-stream, you could use some of the libraries it depends on to parse csv, json, etc (look in the package.json). Otherwise, grab the CSV parsing code from last week. Check out the parse_csv branch for some code you could start with.

flyingzumwalt commented 8 years ago

For reference, here are some tests I wrote that trigger the whole importStream pipeline and test the results. Note: it might be better to test the parseInputStream function on its own, and then test the import process more broadly.

Also see the tests for the csv-parser module. They're a good reference.

Import json

// Writes three JSON lines into a jawn import stream, then reads the resulting
// feed back and checks that each block matches the corresponding input object.
test('import json to jawn', function (t) {
  var jawn = freshJawn()
  var importStream = jawn.createImportStream({'format': 'json'})
  // Imitate the stream that would come from reading sample.json
  // importStream should parse the JSON correctly, transforming the content of each line into a JSON object
  // This is the same as doing
  //    var data = fs.createReadStream('./test/data/sample.json')
  //    data.pipe(importStream)
  // except the writes are being performed synchronously/inline so we can call importStream.end() after writing the contents into it.
  // Each chunk is strictly valid JSON (quoted keys) and newline-terminated, because
  // chunk boundaries are not record boundaries for a stream parser.
  importStream.write('{"foo": "bar", "name": "josie"}\n')
  importStream.write('{"foo": "baz", "name": "eloise"}\n')
  importStream.write('{"foo": "baz", "name": "francoise"}\n')

  var expected = [
    {foo: 'bar', name: 'josie'},
    {foo: 'baz', name: 'eloise'},
    {foo: 'baz', name: 'francoise'}
  ]

  importStream.end(function () {
    var feedId = importStream.id.toString('hex')
    t.same(jawn.core.get(feedId).blocks, 3, 'correct number of blocks returned')

    var rs = jawn.core.createReadStream(feedId)
    rs.on('data', function (block) {
      // Compare parsed objects, not a string against an object.
      // NOTE(review): assumes each block stores the JSON text of one object — confirm against lib/import.js
      t.same(JSON.parse(block.toString()), expected.shift(), 'block matches imported line')
    })
    rs.on('error', function (err) {
      // Fail the test instead of hanging if the feed cannot be read.
      t.end(err)
    })
    rs.on('end', function () {
      // Only finish once every block has been delivered; ending earlier would
      // run the per-block assertions after the test was already closed.
      t.same(expected.length, 0, 'every expected object was read back')
      t.end()
    })
  })
})

Import csv

// Writes a header row plus four CSV rows into a jawn import stream, then reads
// the resulting feed back and checks that each block matches the expected object.
test('import csv to jawn', function (t) {
  var jawn = freshJawn()
  var importStream = jawn.createImportStream({'format': 'csv'})
  // Imitate the stream that would come from reading sample.csv
  // importStream should parse the CSV correctly, identifying the first row as headers
  // This is the same as doing
  //    var data = fs.createReadStream('./test/sample/sample.csv')
  //    data.pipe(importStream)
  // except the writes are being performed synchronously/inline so we can call importStream.end() after writing the contents into it.
  // Rows are newline-terminated: without the line breaks a CSV parser sees a single long row.
  importStream.write('Type of Experience,Little/No Experience,Some Experience,Very Familiar\n')
  importStream.write('Writing software in any programming language,1,5,4\n')
  importStream.write('Frontend Web Development,4,3,3\n')
  importStream.write('Server-side (“backend”) Web Development,4,4,2\n')
  importStream.write('"Using Git to track changes and share code (add, commit, push, pull)",2,5,3\n')

  // NOTE(review): the cell values below are numbers; confirm the import stream converts
  // numeric cells, otherwise these should be the strings '1', '5', etc.
  var expected = [
    {'Type of Experience': 'Writing software in any programming language', 'Little/No Experience': 1, 'Some Experience': 5, 'Very Familiar': 4},
    {'Type of Experience': 'Frontend Web Development', 'Little/No Experience': 4, 'Some Experience': 3, 'Very Familiar': 3},
    // Keeps the curly quotes that the input row contains.
    {'Type of Experience': 'Server-side (“backend”) Web Development', 'Little/No Experience': 4, 'Some Experience': 4, 'Very Familiar': 2},
    {'Type of Experience': 'Using Git to track changes and share code (add, commit, push, pull)', 'Little/No Experience': 2, 'Some Experience': 5, 'Very Familiar': 3}
  ]

  importStream.end(function () {
    var feedId = importStream.id.toString('hex')
    var blocks = jawn.core.get(feedId).blocks

    // If you get 5 blocks instead of 4, importStream probably did not recognize the CSV headers
    t.same(blocks, 4, 'correct number of blocks returned')

    var rs = jawn.core.createReadStream(feedId)

    rs.on('data', function (block) {
      // Compare parsed objects, not a string against an object.
      // NOTE(review): assumes each block stores the JSON text of one row object — confirm against lib/import.js
      t.same(JSON.parse(block.toString()), expected.shift(), 'block matches imported line')
    })

    rs.on('error', function (err) {
      // Fail the test instead of hanging if the feed cannot be read.
      t.end(err)
    })

    rs.on('end', function () {
      // Only finish once every block has been delivered; ending earlier would
      // run the per-block assertions after the test was already closed.
      t.same(expected.length, 0, 'every expected row was read back')
      t.end()
    })
  })
})
flyingzumwalt commented 8 years ago

FYI I figured out how to make parse-input-stream work. See the tests in this PR on that github repo https://github.com/karissa/parse-input-stream/pull/2