Perl / perl5

🐪 The Perl programming language
https://dev.perl.org/perl5/
Other
1.85k stars 527 forks source link

/\S/ is not the same as /[\S]/ #1182

Closed p5pRT closed 20 years ago

p5pRT commented 24 years ago

Migrated from rt.perl.org#2152 (status was 'resolved')

Searchable as RT2152$

p5pRT commented 24 years ago

From rick@consumercontact.com

This is a bug report for perl from rick@consumercontact.com, generated with the help of perlbug 1.26 running under perl 5.00503.


/[\S]/ is matching more stuff than /\S/

I get the same results with 5.003 on DGUX, 5.004_03 on Linux, 5.005_03 on Windows and 5.5.650 on this platform.

$ perl -e '@a=map chr,0..255;die if grep(/\S/,@a)!=grep(/[^\s]/,@a)'
Died at -e line 1.
$ perl -e '@a=map chr,0..255;die if grep(/\S/,@a)!=grep(/[\S]/,@a)'
Died at -e line 1.
$ perl -e '@a=map chr,0..255;die if grep(/\s/,@a)!=grep(/[\s]/,@a)'
$ perl -e '@a=map chr,0..255;die if grep(/\s/,@a)!=grep(/[^\S]/,@a)'

These are the characters that don't match \S and [\S] on this system.

$ perl -le 'for (0..255) {print unless chr($_) =~ /\S/}'
0 9 10 11 12 13 32
$ perl -le 'for (0..255) {print unless chr($_) =~ /[\S]/}'
9 10 12 13 32



Site configuration information for perl 5.00503​:

Configured by rick at Mon Jul 26 15​:51​:49 EDT 1999.

Summary of my perl5 (5.0 patchlevel 5 subversion 3) configuration​:   Platform​:   osname=svr4\, osvers=\, archname=i386-svr4   uname='unix_sv consumer 4.2mp 2.1.3 i386 x86at '   hint=recommended\, useposix=true\, d_sigaction=define   usethreads=undef useperlio=undef d_sfio=undef   Compiler​:   cc='/bin/cc'\, optimize='-O'\, gccversion=   cppflags='-I/usr/include -I/usr/ucbinclude -I/usr/local/include'   ccflags ='-I/usr/include -I/usr/ucbinclude -I/usr/local/include'   stdchar='unsigned char'\, d_stdstdio=define\, usevfork=false   intsize=4\, longsize=4\, ptrsize=4\, doublesize=8   d_longlong=undef\, longlongsize=\, d_longdbl=define\, longdblsize=12   alignbytes=4\, usemymalloc=y\, prototype=define   Linker and Libraries​:   ld='/bin/cc'\, ldflags ='-L/usr/ccs/lib -L/usr/ucblib -L/usr/local/lib -L/usr/gnu/lib'   libpth=/usr/local/lib /usr/gnu/lib /shlib /lib /usr/lib /usr/ccs/lib /usr/ucblib   libs=-lsocket -lnsl -ldbm -ldl -lld -lm -lc -lcrypt -lucb   libc=\, so=so\, useshrplib=true\, libperl=libperl.so   Dynamic Linking​:   dlsrc=dl_dlopen.xs\, dlext=so\, d_dlsymun=undef\, ccdlflags=' '   cccdlflags='-KPIC'\, lddlflags='-G -L/usr/ccs/lib -L/usr/ucblib -L/usr/local/lib -L/usr/gnu/lib'

Locally applied patches​:  


@​INC for perl 5.00503​:   /usr/local/lib/perl5/5.00503/i386-svr4   /usr/local/lib/perl5/5.00503   /usr/local/lib/perl5/site_perl/5.005/i386-svr4   /usr/local/lib/perl5/site_perl/5.005   .


Environment for perl 5.00503​:   HOME=/home1/rick   LANG=C   LANGUAGE (unset)   LD_LIBRARY_PATH=/usr/opt/dash/lib​:.​:/usr/lib/ARCserve   LOGDIR (unset)   PATH=/usr/opt/dash/ccl_custom​:/usr/opt/dash​:/usr/opt/vsifax/obin​:/data1/ubl​:/usr/local/bin​:/usr/local/shbin​:/usr/bin​:/usr/ccs/bin​:/usr/ucb​:/usr/opt/vsifax/bin​:/usr/lib/ARCserve​:/usr/sbin​:/usr/X/bin​:/data1/time/PROG​:/home1/rick/bin​:/opt/bin​:.   PERL_BADLANG (unset)   SHELL=/usr/bin/ksh

p5pRT commented 24 years ago

From [Unknown Contact. See original ticket]

It looks like /\D/ isn't the same as /[\D]/ in 5.5.650 either (it is in 5.005_03).

I have probably naively gone too far on this, but it seems to work. It looked to me like the code for DIGIT and SPACE should just be symmetric with that for ALNUM, so I made it that way.

I don't know how to test the UTF8 stuff.

Rick

*** perl5.5.650/regexec.c.old Thu Feb 17 09​:28​:07 2000 --- perl5.5.650/regexec.c Thu Feb 17 13​:36​:34 2000 *************** *** 2084\,2090 ****   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case SPACE​: ! if (!nextchr && locinput >= PL_regeol)   sayNO;   if (!(OP(scan) == SPACE   ? isSPACE(nextchr) : isSPACE_LC(nextchr))) --- 2084\,2090 ----   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case SPACE​: ! if (!nextchr)   sayNO;   if (!(OP(scan) == SPACE   ? isSPACE(nextchr) : isSPACE_LC(nextchr))) *************** *** 2095\,2105 ****   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case SPACEUTF8​: ! if (!nextchr && locinput >= PL_regeol)   sayNO;   if (nextchr & 0x80) {   if (!(OP(scan) == SPACEUTF8 ! ? swash_fetch(PL_utf8_space\,(U8*)locinput)   : isSPACE_LC_utf8((U8*)locinput)))   {   sayNO; --- 2095\,2105 ----   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case SPACEUTF8​: ! if (!nextchr)   sayNO;   if (nextchr & 0x80) {   if (!(OP(scan) == SPACEUTF8 ! ? swash_fetch(PL_utf8_space\, (U8*)locinput)   : isSPACE_LC_utf8((U8*)locinput)))   {   sayNO; *************** *** 2117\,2125 ****   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case NSPACE​: ! if (!nextchr)   sayNO; ! if (OP(scan) == SPACE   ? isSPACE(nextchr) : isSPACE_LC(nextchr))   sayNO;   nextchr = UCHARAT(++locinput); --- 2117\,2125 ----   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case NSPACE​: ! if (!nextchr && locinput >= PL_regeol)   sayNO; ! if (OP(scan) == NSPACE   ? isSPACE(nextchr) : isSPACE_LC(nextchr))   sayNO;   nextchr = UCHARAT(++locinput); *************** *** 2128\,2138 ****   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case NSPACEUTF8​: ! if (!nextchr)   sayNO;   if (nextchr & 0x80) {   if (OP(scan) == NSPACEUTF8 ! ? swash_fetch(PL_utf8_space\,(U8*)locinput)   : isSPACE_LC_utf8((U8*)locinput))   {   sayNO; --- 2128\,2138 ----   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case NSPACEUTF8​: ! 
if (!nextchr && locinput >= PL_regeol)   sayNO;   if (nextchr & 0x80) {   if (OP(scan) == NSPACEUTF8 ! ? swash_fetch(PL_utf8_space\, (U8*)locinput)   : isSPACE_LC_utf8((U8*)locinput))   {   sayNO; *************** *** 2150\,2156 ****   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case DIGIT​: ! if (!nextchr && locinput >= PL_regeol)   sayNO;   if (!(OP(scan) == DIGIT   ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) --- 2150\,2156 ----   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case DIGIT​: ! if (!nextchr)   sayNO;   if (!(OP(scan) == DIGIT   ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))) *************** *** 2164\,2172 ****   if (!nextchr)   sayNO;   if (nextchr & 0x80) { ! if (OP(scan) == NDIGITUTF8 ! ? swash_fetch(PL_utf8_digit\,(U8*)locinput) ! : isDIGIT_LC_utf8((U8*)locinput))   {   sayNO;   } --- 2164\,2172 ----   if (!nextchr)   sayNO;   if (nextchr & 0x80) { ! if (!(OP(scan) == DIGITUTF8 ! ? swash_fetch(PL_utf8_digit\, (U8*)locinput) ! : isDIGIT_LC_utf8((U8*)locinput)))   {   sayNO;   } *************** *** 2174\,2180 ****   nextchr = UCHARAT(locinput);   break;   } ! if (!isDIGIT(nextchr))   sayNO;   nextchr = UCHARAT(++locinput);   break; --- 2174\,2181 ----   nextchr = UCHARAT(locinput);   break;   } ! if (!(OP(scan) == DIGITUTF8 ! ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))   sayNO;   nextchr = UCHARAT(++locinput);   break; *************** *** 2182\,2190 ****   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case NDIGIT​: ! if (!nextchr)   sayNO; ! if (OP(scan) == DIGIT   ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))   sayNO;   nextchr = UCHARAT(++locinput); --- 2183\,2191 ----   PL_reg_flags |= RF_tainted;   /* FALL THROUGH */   case NDIGIT​: ! if (!nextchr && locinput >= PL_regeol)   sayNO; ! if (OP(scan) == NDIGIT   ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))   sayNO;   nextchr = UCHARAT(++locinput); *************** *** 2196\,2208 ****   if (!nextchr && locinput >= PL_regeol)   sayNO;   if (nextchr & 0x80) { ! 
if (swash_fetch(PL_utf8_digit\,(U8*)locinput))   sayNO;   locinput += PL_utf8skip[nextchr];   nextchr = UCHARAT(locinput);   break;   } ! if (isDIGIT(nextchr))   sayNO;   nextchr = UCHARAT(++locinput);   break; --- 2197\,2214 ----   if (!nextchr && locinput >= PL_regeol)   sayNO;   if (nextchr & 0x80) { ! if (OP(scan) == NDIGITUTF8 ! ? swash_fetch(PL_utf8_digit\, (U8*)locinput) ! : isDIGIT_LC_utf8((U8*)locinput)) ! {   sayNO; + }   locinput += PL_utf8skip[nextchr];   nextchr = UCHARAT(locinput);   break;   } ! if (OP(scan) == NDIGITUTF8 ! ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))   sayNO;   nextchr = UCHARAT(++locinput);   break;