Open StephenOTT opened 7 years ago
Screenshot captures
http://open.canada.ca/data/en/dataset/d4a716f5-a2fc-4c3c-88ed-451fe05900e4
See the comments, which note that the US-to-Canada wait times come from the US-controlled website.
CSV parsing: http://papaparse.com
Example timestamp: 2017-11-08 17:48 AST, where the abbreviation AST is not commonly supported by parsing libraries because it is ambiguous (does AST stand for Atlantic or Alaska?).
Used the following regex and JS to clean the CSV:
load('https://cdnjs.cloudflare.com/ajax/libs/PapaParse/4.3.6/papaparse.min.js');
/**
 * Replace every ";;" (plus one optional trailing space) with a single ";".
 *
 * Matching is left-to-right and non-overlapping, so a run of three
 * semicolons still leaves two behind.
 *
 * @param {string} csvString - Raw CSV text.
 * @returns {string} The text with each matched pair collapsed.
 */
function removeDoubleSemicolons(csvString){
  return csvString.replace(/(;;[ ]?)/g, ';');
}
/**
 * Strip one semicolon from the end of every line.
 *
 * The pattern is multiline, so "$" matches before each line terminator
 * as well as at the very end of the input.
 *
 * @param {string} csvString - CSV text, possibly spanning several lines.
 * @returns {string} The text with a single trailing ";" removed per line.
 */
function removeLineEndingSemicolons(csvString){
  return csvString.replace(/;$/gm, '');
}
// Raw CSV text; `response` is not defined in this snippet and must be
// supplied by the environment that runs the script.
var csvString = response;

// Two cleanup passes: collapse doubled semicolons first, then drop the
// semicolon that terminates each line.
var pass1 = removeDoubleSemicolons(csvString);
var pass2 = removeLineEndingSemicolons(pass1);

// Parse as semicolon-delimited CSV with a header row, ignoring blank lines.
var json = Papa.parse(pass2, { header: true, delimiter: ";", skipEmptyLines: true });

// Only the parsed rows are stored under 'borderWaitTimes' ...
connector.setVariable('borderWaitTimes', S(JSON.stringify(json.data)));
// ... while the complete parse result is the script's final expression.
S(JSON.stringify(json));
using jSoup:
// Read the 'html-response' property of the 'htmlResponse' variable held by `execution`.
var html = execution.getVariable('htmlResponse').prop('html-response').value();
// Resolve the jsoup entry class explicitly with Java.type instead of
// `with (new JavaImporter(org.jsoup))`: the `with` statement is rejected in
// strict mode and hides where each name comes from.
var Jsoup = Java.type('org.jsoup.Jsoup');
var htmlJsoup = Jsoup.parse(html);
// Final expression: the parsed document's title (presumably consumed as the script result).
htmlJsoup.title();
v0.1
used Jsoup
/**
 * Fetch a page with jsoup and return it serialized using XML output syntax.
 *
 * @param {string} url - Address of the page to download.
 * @returns {string} The document's toString() output after its output
 *   settings have been switched to XML syntax.
 */
function getUrlAsXhtmlString(url)
{
  // Look the Java classes up explicitly rather than through
  // `with (new JavaImporter(org.jsoup))`: `with` is forbidden in strict mode
  // and obscures which scope each identifier resolves against.
  var Jsoup = Java.type("org.jsoup.Jsoup");
  // Syntax is a Java enum nested inside Document.OutputSettings.
  var Syntax = Java.type("org.jsoup.nodes.Document.OutputSettings.Syntax");

  var doc = Jsoup.connect(url).get();
  // Switch the serializer from its HTML default to XML before stringifying.
  doc.outputSettings().syntax(Syntax.xml);
  return doc.toString();
}
/**
 * Wrap the given markup with S() and store the result as the 'html'
 * variable on `execution`.
 *
 * @param {string} xHtmlString - Markup text to wrap and store.
 */
function generateSpinVariables(xHtmlString)
{
  // Fix: the body used to read `docString`, which is a local of
  // getUrlAsXhtmlString and not in scope here, so the call threw a
  // ReferenceError. Use this function's own parameter instead.
  var htmlSpin = S(xHtmlString);
  execution.setVariable('html', htmlSpin);
}
/**
 * Download the page at `url` via getUrlAsXhtmlString and pass the resulting
 * text to generateSpinVariables.
 *
 * @param {string} url - Address of the page to scrape.
 */
function scrape(url)
{
  generateSpinVariables(getUrlAsXhtmlString(url));
}
// Run the scrape against the target page as soon as this script is evaluated.
scrape('http://www2.nrcan-rncan.gc.ca/dc-dpm/index.cfm?fuseaction=r.q&lang=eng');
Special note:
doc.outputSettings().syntax(Java.type("org.jsoup.nodes.Document.OutputSettings.Syntax").xml);
This call needs a Java enum value, which required forcing the type explicitly: Java.type("org.jsoup.nodes.Document.OutputSettings.Syntax").xml
Based on: https://stackoverflow.com/a/29039163, https://jsoup.org/apidocs/org/jsoup/nodes/Document.OutputSettings.Syntax.html, and https://stackoverflow.com/a/29087437
To review: https://github.com/camunda/camunda-spin/issues/16#issuecomment-319944327
Xpath query with Camunda SPIN: https://docs.camunda.org/manual/7.7/reference/spin/xml/04-querying-xml/
https://travel.gc.ca/returning/border-times
https://travel.gc.ca/travelling/border-times-us