Raku / nqp

NQP
Other
336 stars 131 forks source link

Don't collapse a string's strands when matching against it #803

Open MasterDuke17 opened 1 year ago

MasterDuke17 commented 1 year ago

This used to be an optimization, but it seems like it isn't needed anymore.

Two test cases don't show any significant change in runtime, and only a very small reduction in memory use. However, maybe these test cases aren't actually that great?

/usr/bin/time raku -e 'srand 1; my $a = ("a".."z", "«", "»", 0..9).flat.roll(100_000).join; my $b; my $s = now; $b = $a ~~ /\d\w+\d/ for ^100_000; say now - $s; say $b' and /usr/bin/time raku -e 'my $page = ("abc" ~ "d" xx 70) x 70; my $a; my $s = now; $a = $page ~~ m/ a (.+) c / for ^100; say now - $s; say $a.chars;'

Currently this causes an Iteration past end of grapheme iterator exception in both the NQP tests and the Rakudo tests. But I'm not sure if it's just uncovering an existing bug, or collapsing the strand really is required.

dan@hermes:~/Source/raku/nqp$ gdb --args ./nqp-m t/p5regex/01-p5regex.t
GNU gdb (Ubuntu 13.1-2ubuntu2) 13.1
<...>
(gdb) b MVM_exception_throw_adhoc
Function "MVM_exception_throw_adhoc" not defined.
Make breakpoint pending on future shared library load? (y or [n]) y
Breakpoint 1 (MVM_exception_throw_adhoc) pending.
(gdb) r
Starting program: /home/dan/Source/raku/nqp/nqp-m t/p5regex/01-p5regex.t
<...>
[New Thread 0x7ffff6fff6c0 (LWP 26738)]
# file: rx_basic
ok 1 - basic literal
<...>
ok 175 - ignorecase (:i)

Thread 1 "moar" hit Breakpoint 1, MVM_exception_throw_adhoc (tc=0x20000020180, messageFormat=0x7ffff7884968 "Iteration past end of grapheme iterator") at src/core/exceptions.c:882
882     MVM_NO_RETURN void MVM_exception_throw_adhoc(MVMThreadContext *tc, const char *messageFormat, ...) {
(gdb) bt
#0  MVM_exception_throw_adhoc (tc=0x20000020180, messageFormat=0x7ffff7884968 "Iteration past end of grapheme iterator") at src/core/exceptions.c:882
#1  0x00007ffff7788cd8 in MVM_string_gi_get_grapheme (tc=0x20000020180, gi=0x7fffffffbd10) at src/strings/iter.h:184
#2  0x00007ffff778915f in MVM_string_gi_cached_init (tc=0x20000020180, gic=0x7fffffffbd10, s=0x20000031c68, index=6) at src/strings/iter.h:358
#3  0x00007ffff778dba2 in string_equal_at_ignore_case (tc=0x20000020180, Haystack=0x20000031c68, needle=0x20000ea2af0, H_offset=6, ignoremark=0, ignorecase=1) at src/strings/ops.c:1273
#4  0x00007ffff778e470 in MVM_string_equal_at_ignore_case (tc=0x20000020180, Haystack=0x20000031c68, needle=0x20000ea2af0, H_offset=6) at src/strings/ops.c:1412
#5  0x00007ffff7624855 in MVM_interp_run (tc=0x20000020180, initial_invoke=0x7ffff77f1530 <toplevel_initial_invoke>, invoke_data=0x200006e8c00, outer_runloop=0x0) at src/core/interp.c:1429
#6  0x00007ffff77f1696 in MVM_vm_run_file (instance=0x20000010000, filename=0x7fffffffdf91 "nqp.moarvm") at src/moar.c:505
#7  0x0000555555555cdf in main (argc=5, argv=0x7fffffffdb38) at src/main.c:307
(gdb) f 3
#3  0x00007ffff778dba2 in string_equal_at_ignore_case (tc=0x20000020180, Haystack=0x20000031c68, needle=0x20000ea2af0, H_offset=6, ignoremark=0, ignorecase=1) at src/strings/ops.c:1273
1273            MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
(gdb) p Haystack
$1 = (MVMString *) 0x20000031c68
(gdb) p *Haystack
$2 = {common = {header = {sc_forward_u = {forwarder = 0x0, sc = {sc_idx = 0, idx = 0}, st = 0x0}, owner = 1, flags1 = 0 '\000', flags2 = 1 '\001', size = 48}, st = 0x20000740000}, body = {storage = {
      blob_32 = 0x200016ad480, blob_ascii = 0x200016ad480 "\310\025\003", blob_8 = 0x200016ad480 "\310\025\003", strands = 0x200016ad480, any = 0x200016ad480}, storage_type = 3, num_strands = 1, 
    num_graphs = 6, cached_hash_code = 0}}
(gdb) p *needle
$3 = {common = {header = {sc_forward_u = {forwarder = 0x0, sc = {sc_idx = 0, idx = 0}, st = 0x0}, owner = 1, flags1 = 0 '\000', flags2 = 2 '\002', size = 48}, st = 0x20000740000}, body = {storage = {
      blob_32 = 0x20000d31840, blob_ascii = 0x20000d31840 "bcd\336\336\336\336\336q,Id\005", blob_8 = 0x20000d31840 "bcd\336\336\336\336\336q,Id\005", strands = 0x20000d31840, any = 0x20000d31840}, 
    storage_type = 2, num_strands = 0, num_graphs = 3, cached_hash_code = 0}}
(gdb) f 5
#5  0x00007ffff7624855 in MVM_interp_run (tc=0x20000020180, initial_invoke=0x7ffff77f1530 <toplevel_initial_invoke>, invoke_data=0x200006e8c00, outer_runloop=0x0) at src/core/interp.c:1429
1429                    GET_REG(cur_op, 0).i64 = MVM_string_equal_at_ignore_case(tc,
(gdb) list
1424                        GET_REG(cur_op, 2).s, GET_REG(cur_op, 4).s,
1425                        GET_REG(cur_op, 6).i64);
1426                    cur_op += 8;
1427                    goto NEXT;
1428                OP(eqatic_s):
1429                    GET_REG(cur_op, 0).i64 = MVM_string_equal_at_ignore_case(tc,
1430                        GET_REG(cur_op, 2).s, GET_REG(cur_op, 4).s,
1431                        GET_REG(cur_op, 6).i64);
1432                    cur_op += 8;
1433                    goto NEXT;
(gdb) f 4
#4  0x00007ffff778e470 in MVM_string_equal_at_ignore_case (tc=0x20000020180, Haystack=0x20000031c68, needle=0x20000ea2af0, H_offset=6) at src/strings/ops.c:1412
1412        return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 0, 1);
(gdb) p *Haystack
$4 = {common = {header = {sc_forward_u = {forwarder = 0x0, sc = {sc_idx = 0, idx = 0}, st = 0x0}, owner = 1, flags1 = 0 '\000', flags2 = 1 '\001', size = 48}, st = 0x20000740000}, body = {storage = {
      blob_32 = 0x200016ad480, blob_ascii = 0x200016ad480 "\310\025\003", blob_8 = 0x200016ad480 "\310\025\003", strands = 0x200016ad480, any = 0x200016ad480}, storage_type = 3, num_strands = 1, 
    num_graphs = 6, cached_hash_code = 0}}
(gdb) p *needle
$5 = {common = {header = {sc_forward_u = {forwarder = 0x0, sc = {sc_idx = 0, idx = 0}, st = 0x0}, owner = 1, flags1 = 0 '\000', flags2 = 2 '\002', size = 48}, st = 0x20000740000}, body = {storage = {
      blob_32 = 0x20000d31840, blob_ascii = 0x20000d31840 "bcd\336\336\336\336\336q,Id\005", blob_8 = 0x20000d31840 "bcd\336\336\336\336\336q,Id\005", strands = 0x20000d31840, any = 0x20000d31840}, 
    storage_type = 2, num_strands = 0, num_graphs = 3, cached_hash_code = 0}}
(gdb) list
1407        }
1408        return -1;
1409    }
1410
1411    MVMint64 MVM_string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
1412        return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 0, 1);
1413    }
1414    MVMint64 MVM_string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
1415        return string_index_ignore_case(tc, Haystack, needle, start, 0, 1);
1416    }
(gdb) f 3
#3  0x00007ffff778dba2 in string_equal_at_ignore_case (tc=0x20000020180, Haystack=0x20000031c68, needle=0x20000ea2af0, H_offset=6, ignoremark=0, ignorecase=1) at src/strings/ops.c:1273
1273            MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
(gdb) list
1268            needle_fc = ignorecase ? MVM_string_fc(tc, needle) : needle;
1269        });
1270        n_fc_graphs = MVM_string_graphs(tc, needle_fc);
1271        if (Haystack->body.storage_type == MVM_STRING_STRAND) {
1272            MVMGraphemeIter_cached H_gic;
1273            MVM_string_gi_cached_init(tc, &H_gic, Haystack, H_offset);
1274            H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, &H_gic, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, ignorecase, 1);
1275        }
1276        else {
1277            H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Haystack, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, ignorecase, 0);
(gdb) 

@jnthn ping

MasterDuke17 commented 1 year ago

A raku -e '"foo" ~~ s:g:i/O/x/' is a shorter golf.

MasterDuke17 commented 1 year ago

Even shorter: raku -e '"o" ~~ m:g:i/O/'

MasterDuke17 commented 1 year ago

FWIW, changing the nqp::indexingoptimized to an nqp::decont or nqp::clone (or either combination of both of them) instead of removing it doesn't fix the error.