Segfault when there is a combination of newlines and none

It appears that an input string containing at least 15 newlines, which is then modified so that it has no trailing newline, causes the tokenizer to dump core.

use Compiler::Lexer; my $string = "\n" x 15; $string = "#"; Compiler::Lexer->new->tokenize($string);

Outputs:

*** glibc detected *** perl: free(): invalid next size (fast): 0x0000000013a82290 ***
======= Backtrace: =========
/lib64/libc.so.6[0x2b7ad4b1e4af]
/lib64/libc.so.6(cfree+0x4b)[0x2b7ad4b227ab]
<...>lib/perl5/x86_64-linux-thread-multi/auto/Compiler/Lexer/Lexer.so(_ZN5Lexer12clearContextEv+0x29)[0x2b7ad88094a9]
<...>lib/perl5/x86_64-linux-thread-multi/auto/Compiler/Lexer/Lexer.so[0x2b7ad8804645]
perl(Perl_pp_entersub+0x58f)[0x48900f]
perl(Perl_runops_standard+0xe)[0x4875fe]
perl(perl_run+0x305)[0x431665]
perl(main+0xcd)[0x41d46d]
/lib64/libc.so.6(__libc_start_main+0xf4)[0x2b7ad4aca9f4]
perl(sin+0xb1)[0x41d2e9]

My setup:

Summary of my perl5 (revision 5 version 10 subversion 1) configuration: Platform: osname=linux, osvers=2.6.18-400.1.1.el5, archname=x86_64-linux-thread-multi uname='linux 2.6.18-400.1.1.el5 #1 smp thu dec 18 00:59:53 est 2014 x86_64 x86_64 x86_64 gnulinux ' config_args='-Dprefix=.plenv/versions/5.10.1 -de -Dusedevel -Dusethreads -A'eval:scriptdir=.plenv/versions/5.10.1/bin'' hint=recommended, useposix=true, d_sigaction=define useithreads=define, usemultiplicity=define useperlio=define, d_sfio=undef, uselargefiles=define, usesocks=undef use64bitint=define, use64bitall=define, uselongdouble=undef usemymalloc=n, bincompat5005=undef Compiler: cc='cc', ccflags ='-D_REENTRANT -D_GNU_SOURCE -fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64', optimize='-O2', cppflags='-D_REENTRANT -D_GNU_SOURCE -fno-strict-aliasing -pipe -fstack-protector -I/usr/local/include' ccversion='', gccversion='4.1.2 20080704 (Red Hat 4.1.2-54)', gccosandvers='' intsize=4, longsize=8, ptrsize=8, doublesize=8, byteorder=12345678 d_longlong=define, longlongsize=8, d_longdbl=define, longdblsize=16 ivtype='long', ivsize=8, nvtype='double', nvsize=8, Off_t='off_t', lseeksize=8 alignbytes=8, prototype=define Linker and Libraries: ld='cc', ldflags =' -fstack-protector -L/usr/local/lib' libpth=/usr/local/lib /lib /usr/lib /lib64 /usr/lib64 /usr/local/lib64 libs=-lnsl -ldb -ldl -lm -lcrypt -lutil -lpthread -lc perllibs=-lnsl -ldl -lm -lcrypt -lutil -lpthread -lc libc=libc-2.5.so, so=so, useshrplib=false, libperl=libperl.a gnulibc_version='2.5' Dynamic Linking: dlsrc=dl_dlopen.xs, dlext=so, d_dlsymun=undef, ccdlflags='-Wl,-E' cccdlflags='-fPIC', lddlflags='-shared -O2 -L/usr/local/lib -fstack-protector' Characteristics of this binary (from libperl): Compile-time options: MULTIPLICITY PERL_DONT_CREATE_GVSV PERL_IMPLICIT_CONTEXT PERL_MALLOC_WRAP PERL_USE_DEVEL USE_64_BIT_ALL USE_64_BIT_INT USE_ITHREADS USE_LARGE_FILES USE_PERLIO USE_REENTRANT_API Built 
under linux Compiled at Mar 4 2015 13:42:57

A buffer overflow occurs in LexContext::clearBuffer

I applied the following patch to check for the buffer overflow and rebuilt Compiler::Lexer.

diff --git a/include/lexer.hpp b/include/lexer.hpp
index e0efdc1..ffe9796 100644
--- a/include/lexer.hpp
+++ b/include/lexer.hpp
@@ -128,6 +128,8 @@ public:
    }
 };

+extern size_t debug_buffer_size;
+
 class LexContext {
 public:
    ScriptManager *smgr;
@@ -153,6 +155,10 @@ public:
        token_buffer[0] = EOL;
        buffer_idx = 0;
        token_buffer++;
+       if (token_buffer > (buffer_head + debug_buffer_size)) {
+           fprintf(stderr, "#### BUFFER OVERFLOW ####\n");
+           exit(1);
+       }
        token_buffer[0] = EOL;
    }

diff --git a/src/compiler/lexer/Compiler_lexer.cpp b/src/compiler/lexer/Compiler_lexer.cpp
index 96c195d..f745b1f 100644
--- a/src/compiler/lexer/Compiler_lexer.cpp
+++ b/src/compiler/lexer/Compiler_lexer.cpp
@@ -10,12 +10,13 @@ namespace TokenKind = Enum::Token::Kind;

 Module::Module(const char *name_, const char *args_)
    : name(name_), args(args_) {}
-
+size_t debug_buffer_size;
 LexContext::LexContext(const char *filename, char *script)
    : progress(0), buffer_idx(0)
 {
    script_size = strlen(script) + 1;
    token_buffer = (char *)malloc((script_size + EXTEND_BUFFER_SIZE) * 2);
+   debug_buffer_size = ((script_size + EXTEND_BUFFER_SIZE) * 2);
    buffer_head = token_buffer;
    token_buffer[0] = EOL;
    prev_type = TokenType::Undefined;

Then I ran the code.

% cat test.pl
use Compiler::Lexer;
my $string = "\n" x 15;
$string = "#";
Compiler::Lexer->new->tokenize($string);
% perl test.pl
#### BUFFER OVERFLOW ####

goccy / p5-Compiler-Lexer

Segfault when there is a combination of newlines and none #59