makamaka / Text-CSV

comma-separated values manipulator
24 stars 19 forks source link

Double headers when both detect_bom and headers are provided #63

Closed polettix closed 1 year ago

polettix commented 1 year ago

Hi!

Using the module for a command-line tool, I'm consistently setting headers => 'auto' and give the possibility to specify a detect_bom based on a command-line option.

Tests show that when both options are active, headers are read twice, which ends up skipping the first line (where real headers live) and trying to set headers from the second line.

I'd say that this behaviour is surprising and sub-optimal, although admittedly there's no indication that this should not behave this way in the docs. I'm currently working this around like this:

my $has_bom = ...;
my @csv_args = (
   ($has_bom ? (detect_bom => 1) : (headers => 'auto')),
    ...
);

If you're interested, this is a full test for this:

#!/usr/bin/env perl
use strict;
use warnings;

use Test::More;
use Encode qw< decode encode >;
use Text::CSV_PP 'csv';
use Data::Dumper;

ok 1, 'tests start';

my $input = input();
my $input2 = input2();

subtest 'plain array' => sub {
   open my $fh, '<', \$input or die "open(): $!";
   my $parsed = csv(in => $fh, sep_char => ',');
   isa_ok $parsed, 'ARRAY';

   my $n_read = @$parsed;
   is $n_read, 3, 'lines read';
   is length($parsed->[0][0]), 4, 'length of first parsed thing';
};

subtest 'with headers => auto' => sub {
   open my $fh, '<', \$input or die "open(): $!";
   my $parsed = csv(in => $fh, sep_char => ',', headers => 'auto');
   isa_ok $parsed, 'ARRAY';

   my $n_read = @$parsed;
   is $n_read, 2, 'records read';

   my $record = $parsed->[0];
   isa_ok $record, 'HASH';
   my $key;
   for my $k (keys %{$record}) {
      next if $record->{$k};
      $key = $k;
      last;
   }
   is length($key), 4, 'length of "first" key';
};

subtest 'with detect_bom => 1' => sub {
   open my $fh, '<', \$input or die "open(): $!";
   my $parsed = csv(in => $fh, sep_char => ',', detect_bom => 1);
   isa_ok $parsed, 'ARRAY';

   my $n_read = @$parsed;
   is $n_read, 2, 'records read';

   my $record = $parsed->[0];
   isa_ok $record, 'HASH';
   my $key;
   for my $k (keys %{$record}) {
      next if $record->{$k};
      $key = $k;
      last;
   }
   is length($key), 3, 'length of "first" key (stripped!)';
};

subtest 'input with detect_bom => 1 and headers => auto' => sub {
   open my $fh, '<', \$input or die "open(): $!";

   my $parsed = eval {
      csv(in => $fh, sep_char => ',', detect_bom => 1, headers => 'auto');
   };
   if ($parsed) {
      isa_ok $parsed, 'ARRAY';

      my $n_read = @$parsed;
      is $n_read, 2, 'records read';

      my $record = $parsed->[0];
      isa_ok $record, 'HASH';
      my $key;
      for my $k (keys %{$record}) {
         next if $record->{$k};
         $key = $k;
         last;
      }
      is length($key), 3, 'length of "first" key (stripped!)';
   }
   else {
      fail "could not parse: $@";
   }
};

subtest 'input2 with detect_bom => 1 and headers => auto' => sub {
   open my $fh, '<', \$input2 or die "open(): $!";

   my $parsed = eval {
      csv(in => $fh, sep_char => ',', detect_bom => 1, headers => 'auto');
   };
   if ($parsed) {
      isa_ok $parsed, 'ARRAY';
      my $n_read = @$parsed;
      is $n_read, 2, 'records read' or diag "read $n_read";

      my $record = $parsed->[0];
      isa_ok $record, 'HASH';
      my $key;
      for my $k (keys %{$record}) {
         next if $record->{$k};
         $key = $k;
         last;
      }
      is length($key), 3, 'length of "first" key (stripped!)'
         or diag "key<$key>";
   }
   else {
      fail "could not parse: $@";
   }
};

done_testing();

sub input {
   my $string = <<"END";
\x{FEFF}foo,bar
,12
0,11
END
   return encode('UTF-8', $string);
}

sub input2 {
   my $string = <<"END";
\x{FEFF}foo,bar
NOMNOM,da-bah
0,11
END
   return encode('UTF-8', $string);
}

Output:

$ perl -MText::CSV_PP -E 'say $Text::CSV_PP::VERSION'
2.02

$ prove -v ./csv_pp-test01.t 
./csv_pp-test01.t .. 
ok 1 - tests start
# Subtest: plain array
    ok 1 - A reference of type 'ARRAY' isa 'ARRAY'
    ok 2 - lines read
    ok 3 - length of first parsed thing
    1..3
ok 2 - plain array
# Subtest: with headers => auto
    ok 1 - A reference of type 'ARRAY' isa 'ARRAY'
    ok 2 - records read
    ok 3 - A reference of type 'HASH' isa 'HASH'
    ok 4 - length of "first" key
    1..4
ok 3 - with headers => auto
# Subtest: with detect_bom => 1
    ok 1 - A reference of type 'ARRAY' isa 'ARRAY'
    ok 2 - records read
    ok 3 - A reference of type 'HASH' isa 'HASH'
    ok 4 - length of "first" key (stripped!)
    1..4
ok 4 - with detect_bom => 1
# Subtest: input with detect_bom => 1 and headers => auto
# CSV_PP ERROR: 1012 - INI - the header contains an empty field @ rec 2 pos 0
    not ok 1 - could not parse: INI - the header contains an empty field
    1..1
not ok 5 - input with detect_bom => 1 and headers => auto
# Subtest: input2 with detect_bom => 1 and headers => auto

    #   Failed test 'could not parse: INI - the header contains an empty field'
    #   at ./csv_pp-test01.t line 86.
    # Looks like you failed 1 test of 1.

#   Failed test 'input with detect_bom => 1 and headers => auto'
#   at ./csv_pp-test01.t line 88.
    ok 1 - A reference of type 'ARRAY' isa 'ARRAY'
    not ok 2 - records read

    #   Failed test 'records read'
    #   at ./csv_pp-test01.t line 99.
    #          got: '1'
    #     expected: '2'
    ok 3 - A reference of type 'HASH' isa 'HASH'
    not ok 4 - length of "first" key (stripped!)
    1..4
not ok 6 - input2 with detect_bom => 1 and headers => auto
1..6
    # read 1

    #   Failed test 'length of "first" key (stripped!)'
    #   at ./csv_pp-test01.t line 109.
    #          got: '6'
    #     expected: '3'
    # key<NOMNOM>
    # Looks like you failed 2 tests of 4.

#   Failed test 'input2 with detect_bom => 1 and headers => auto'
#   at ./csv_pp-test01.t line 115.
# Looks like you failed 2 tests of 6.
Dubious, test returned 2 (wstat 512, 0x200)
Failed 2/6 subtests 

Test Summary Report
-------------------
./csv_pp-test01.t (Wstat: 512 Tests: 6 Failed: 2)
  Failed tests:  5-6
  Non-zero exit status: 2
Files=1, Tests=6,  0 wallclock secs ( 0.00 usr  0.01 sys +  0.05 cusr  0.00 csys =  0.06 CPU)
Result: FAIL
Tux commented 1 year ago

Relatively easy to reproduce (in CSV_XS). This will most likely be the new test-unit:

$ cat t/68_header.t
#!/usr/bin/perl

use strict;
use warnings;

 use Test::More "no_plan";
#use Test::More tests => 24;
 use Config;

BEGIN {
    use_ok "Text::CSV_XS", "csv";
    plan skip_all => "Cannot load Text::CSV_XS" if $@;
    require "./t/util.pl";
    }

my $tfn = "_68test.csv"; END { unlink $tfn, "_$tfn"; }

my @dta = (
    [qw( foo  bar   zap     )],
    [qw( mars venus pluto   )],
    [qw( 1    2     3       )],
    );
my @dth = (
    { foo => "mars", bar => "venus", zap => "pluto" },
    { foo => 1,      bar => 2,       zap => 3       },
    );

{   open my $fh, ">", $tfn or die "$tfn: $!\n";
    local $" = ",";
    print $fh "@$_\n" for @dta;
    close $fh;
    }

is_deeply (csv (in => $tfn),                              \@dta, "csv ()");
is_deeply (csv (in => $tfn, bom => 1),                    \@dth, "csv (bom)");
is_deeply (csv (in => $tfn,           headers => "auto"), \@dth, "csv (headers)");
is_deeply (csv (in => $tfn, bom => 1, headers => "auto"), \@dth, "csv (bom, headers)");

foreach my $arg ("", "bom", "auto", "bom, auto") {
    open my $fh, "<", $tfn or die "$tfn: $!\n";
    my %attr;
    $arg =~ m/bom/  and $attr{bom}     = 1;
    $arg =~ m/auto/ and $attr{headers} = "auto";
    ok (my $csv = Text::CSV_XS->new (), "New $arg");
    is ($csv->record_number, 0, "Start");
    if ($arg) {
    is_deeply ([ $csv->header   ($fh, \%attr) ], $dta[0], "Header") if $arg;
    diag ($csv->record_number);
    is_deeply ($csv->getline_hr ($fh), $dth[$_], "getline $_") for 0..$#dta;
    diag ($csv->record_number);
    }
    else {
    is_deeply ($csv->getline    ($fh), $dta[$_], "getline $_") for 0..$#dta;
    is ($csv->record_number, 3, "Done");
    }
    close $fh;
    }

done_testing;

Still failing, but working on it …

Tux commented 1 year ago

fix for Text::CSV_XS being tested. The next commit makes it PASS all the way back to perl-5.6.1. Some more tests in progress, but likely I'll uload a new release today or tomorrow

charsbar commented 1 year ago

Thanks. Shipped Text::CSV 2.03 with the fixes from Text::CSV_XS 1.51