Closed bartek5186 closed 2 years ago
I am not able to reproduce this, using the example you posted in the linked issue:
[
  Document {
    name: { default: 'Grudziądz' },
    phrase: { default: 'Grudziądz' },
    parent: {},
    address_parts: { zip: '86-300' },
    center_point: { lon: 18.76077, lat: 53.468958 },
    category: [],
    addendum: {},
    source: 'bdp',
    layer: 'postalcode',
    source_id: '71ff447b-972b-4f7d-a8c1-e0c8c02a1a19',
    popularity: 100
  }
]
bdp:postalcode:71ff447b-972b-4f7d-a8c1-e0c8c02a1a19
It may be something to do with how your CSV is encoded. Can you please upload a copy of the file you're using (either the whole thing, or at least the header line and the first data line)?
agh sorry, you're saying this is only triggered when using multiple files... let me check that
I can send 2 files if it doesn't reproduce with a normal "multiple files" case.
I'm not able to reproduce this; please send me the files you're using:
cat data/example2.csv
source,popularity,layer,id,lat,lon,name,postalcode,country,name_jso
bdp,100,postalcode,71ff447b-972b-4f7d-a8c1-e0c8c02a1a19,53.468958363988,18.760770296251,Grudziądz,86-300,PL,"[""86-300"", "" 86-301"", "" 86-302"", "" 86-303"", "" 86-304"", "" 86-305"", "" 86-306"", "" 86-307"", "" 86-308"", "" 86-309"", "" 86-310"", "" 86-311""]"
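As a side note on the last column: the doubled quotes in name_json are just CSV escaping, so once the CSV layer has parsed the field, the value is a plain JSON array. A quick sanity check of that (a generic sketch, not importer code, using a shortened copy of the field above):

// after CSV unescaping, name_json is an ordinary JSON array of strings;
// the source data has stray leading spaces in some entries, hence the trim()
const field = '["86-300", " 86-301", " 86-302", " 86-303"]';
const codes = JSON.parse(field).map(s => s.trim());
console.log(codes); // [ '86-300', '86-301', '86-302', '86-303' ]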
diff --git a/test/streams/recordStream.js b/test/streams/recordStream.js
index 1fdd378..3be6427 100644
--- a/test/streams/recordStream.js
+++ b/test/streams/recordStream.js
@@ -144,6 +144,45 @@ tape(
   }
 );
 
+const path = require('path');
+
+/**
+ * Test the full recordStream pipeline from input CSV to output pelias-model
+ *
+ * issue: https://github.com/pelias/csv-importer/issues/89
+ */
+tape(
+  'import multiple CSV files and test the results.',
+  function (test) {
+    const fixtures = [
+      path.resolve(`${__dirname}/../../data/example.csv`),
+      path.resolve(`${__dirname}/../../data/example2.csv`),
+    ];
+    const dataStream = recordStream.create(fixtures);
+    test.ok(dataStream.readable, 'Stream is readable.');
+
+    // read documents into array
+    const docs = [];
+    const testStream = through.obj((doc, enc, next) => {
+      docs.push(doc);
+      next();
+    });
+
+    // test assertions
+    testStream.on('finish', () => {
+      test.equal(docs.length, 4, 'total of 4 valid documents');
+      test.equal(docs[0].getGid(), 'pelias:example_layer:1');
+      test.equal(docs[1].getGid(), 'pelias:address:2');
+      test.equal(docs[2].getGid(), 'pelias:with_custom_data:4');
+      test.equal(docs[3].getGid(), 'bdp:postalcode:71ff447b-972b-4f7d-a8c1-e0c8c02a1a19');
+      test.end();
+    });
+
+    // run
+    dataStream.pipe(testStream);
+  }
+);
+
 tape( 'getIdPrefix returns prefix based on directory structure', function( test ) {
   var filename = '/base/path/us/ca/san_francisco.csv';
   var basePath = '/base/path';
The files are bdp_pl.csv, bdp_de.csv and bdp_uk.csv. This is how I have the config and the API set up:
Hmmm... okay, so I don't think this has to do with having multiple files.
I'm about to finish for the day, but it appears that the issue is the three leading bytes present in the files you provided. My suspicion is that these are a Byte Order Mark (BOM).
A BOM is not required for utf-8 encoding, so I'm not sure why it's there.
We should be able to detect these and ignore them, although if your data is in utf-16 then converting it to utf-8 should also remove the BOM.
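That would also explain the symptom: with a BOM attached, the first header cell parses as '\ufeffsource' rather than 'source', so the importer never matches the source column and presumably falls back to its default source of 'csv'. As an illustration of the "detect these and ignore them" idea, here's a minimal sketch (not the actual fix; it just reuses through2, which the test above already imports):

const through = require('through2');

// strip a leading UTF-8 BOM (0xEF 0xBB 0xBF) from a byte stream;
// assumes the 3-byte prefix arrives in the first chunk, which holds
// for fs.createReadStream on any non-trivial file
function stripBom() {
  let firstChunk = true;
  return through(function (chunk, enc, next) {
    if (firstChunk) {
      firstChunk = false;
      if (chunk.length >= 3 && chunk[0] === 0xEF && chunk[1] === 0xBB && chunk[2] === 0xBF) {
        chunk = chunk.slice(3); // drop the BOM so the CSV parser sees a clean header row
      }
    }
    next(null, chunk);
  });
}

// usage: fs.createReadStream('data/bdp_de.csv').pipe(stripBom()).pipe(csvParser)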
head -n1 data/bdp_de.csv | xxd -b
00000000: 11101111 10111011 10111111 01110011 01101111 01110101 ...sou
00000006: 01110010 01100011 01100101 00101100 01110000 01101111 rce,po
0000000c: 01110000 01110101 01101100 01100001 01110010 01101001 pulari
00000012: 01110100 01111001 00101100 01101100 01100001 01111001 ty,lay
00000018: 01100101 01110010 00101100 01101001 01100100 00101100 er,id,
0000001e: 01101100 01100001 01110100 00101100 01101100 01101111 lat,lo
00000024: 01101110 00101100 01101110 01100001 01101101 01100101 n,name
0000002a: 00101100 01110000 01101111 01110011 01110100 01100001 ,posta
00000030: 01101100 01100011 01101111 01100100 01100101 00101100 lcode,
00000036: 01100011 01101111 01110101 01101110 01110100 01110010 countr
0000003c: 01111001 00101100 01101110 01100001 01101101 01100101 y,name
00000042: 01011111 01101010 01110011 01101111 01101110 00001101 _json.
00000048: 00001010
head -n1 data/bdp_de.csv | xxd
00000000: efbb bf73 6f75 7263 652c 706f 7075 6c61 ...source,popula
00000010: 7269 7479 2c6c 6179 6572 2c69 642c 6c61 rity,layer,id,la
00000020: 742c 6c6f 6e2c 6e61 6d65 2c70 6f73 7461 t,lon,name,posta
00000030: 6c63 6f64 652c 636f 756e 7472 792c 6e61 lcode,country,na
00000040: 6d65 5f6a 736f 6e0d 0a                   me_json..
From the wiki:
The UTF-8 representation of the BOM is the (hexadecimal) byte sequence 0xEF,0xBB,0xBF.
So yeah, that's exactly what it is: 11101111 10111011 10111111
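If you want to check which of your files are affected without reading hex dumps, a dependency-free check in Node could look like this (file names taken from the list you posted earlier):

const fs = require('fs');

// read the first 3 bytes of each file and compare against the UTF-8 BOM
const BOM = Buffer.from([0xEF, 0xBB, 0xBF]);
for (const file of ['data/bdp_pl.csv', 'data/bdp_de.csv', 'data/bdp_uk.csv']) {
  const buf = Buffer.alloc(3);
  const fd = fs.openSync(file, 'r');
  fs.readSync(fd, buf, 0, 3, 0);
  fs.closeSync(fd);
  console.log(file, buf.equals(BOM) ? 'has a BOM' : 'no BOM');
}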
Yes, I have this on the server too.
Proposed fix: https://github.com/pelias/csv-importer/pull/93
If I import multiple CSV files, the source name is automatically changed to "csv" (note: in the files the source column is properly set to a custom name). The source has the name "csv" in the imported data.
The problem doesn't exist if I use one CSV file.
Example: