ScintillaOrg / lexilla

A library of language lexers for use with Scintilla
https://www.scintilla.org/Lexilla.html
Other
187 stars 67 forks source link

[Bash] Improve backslash handling inside backquoted command substitution #194

Closed zufuliu closed 1 year ago

zufuliu commented 1 year ago

Created for https://sourceforge.net/p/scintilla/bugs/2100/, not sure whether worth the complex though. https://www.gnu.org/software/bash/manual/bash.html#Command-Substitution

only backslash followed by $, ", ' and ` need to be special handled, following is what observed from msys2 bash 5.2.15: more: ask more input after > prompt. fail: unexpected EOF while looking for matching quote.

  1. 4N + 2 or 4N + 3 backslashes escapes dollar (bar=A)
count code result
0 echo `foo=$bar; echo $foo` | A
1 echo `foo=\$bar; echo $foo` | A
2 echo `foo=\\$bar; echo $foo` | $bar
3 echo `foo=\\\$bar; echo $foo` | $bar
4 echo `foo=\\\\$bar; echo $foo` | \A
5 echo `foo=\\\\\$bar; echo $foo` | \A
6 echo `foo=\\\\\\$bar; echo $foo` | \$bar
7 echo `foo=\\\\\\\$bar; echo $foo` | \$bar
8 echo `foo=\\\\\\\\$bar; echo $foo` | \\A
9 echo `foo=\\\\\\\\\$bar; echo $foo` | \\A
10 echo `foo=\\\\\\\\\\$bar; echo $foo` | \\$bar
11 echo `foo=\\\\\\\\\\\$bar; echo $foo`| \\$bar
  1. 4N + 1 or 4N + 2 backslashes escapes double and single quote
count code result
0 echo `foo="; echo $foo` fail
1 echo `foo=\"; echo $foo` | ok, "
2 echo `foo=\\"; echo $foo` | ok, "
3 echo `foo=\\\"; echo $foo` fail
4 echo `foo=\\\\"; echo $foo` fail
5 echo `foo=\\\\\"; echo $foo` | ok, \"
6 echo `foo=\\\\\\"; echo $foo` | ok, \"
7 echo `foo=\\\\\\\"; echo $foo` fail
8 echo `foo=\\\\\\\\"; echo $foo` fail
9 echo `foo=\\\\\\\\\"; echo $foo` | ok, \\"
10 echo `foo=\\\\\\\\\\"; echo $foo` | ok, \\"
11 echo `foo=\\\\\\\\\\\"; echo $foo` fail
  1. 4N + 3 backslashes escapes backquote
count code result
0 echo `foo=`; echo $foo` more
1 echo `foo=\`; echo $foo` fail
2 echo `foo=\\`; echo $foo` more
3 echo `foo=\\\`; echo $foo` | ok, `
4 echo `foo=\\\\`; echo $foo` more
5 echo `foo=\\\\\`; echo $foo` fail
6 echo `foo=\\\\\\`; echo $foo` more
7 echo `foo=\\\\\\\`; echo $foo` | ok, \`
8 echo `foo=\\\\\\\\`; echo $foo` more
9 echo `foo=\\\\\\\\\`; echo $foo` fail
10 echo `foo=\\\\\\\\\\`; echo $foo` more
11 echo `foo=\\\\\\\\\\\`; echo $foo` | ok, \\`
zufuliu commented 1 year ago

Python script to run the tests:

import os
import subprocess

def run_test(what, n, prefix=''):
    code = prefix + 'echo `foo=' + '\\'*n + what + '; echo $foo`\n'
    path = str(ord(what[0])) + '-'  + str(n)  + '.sh'
    with open(path, 'wb') as fd:
        fd.write(code.encode('utf-8'))

    proc = subprocess.run(['bash', path], capture_output=True)
    try:
        os.remove(path)
    except:
        pass
    return proc.stdout.decode('utf-8').strip(), proc.stderr.decode('utf-8').strip()

def test_all(max_num):
    result = []
    all_test = [('\'', ''), ('\"', ''), ('`', ''), ('$bar', 'bar=A\n'), ('$((1 + 2))', ''), ('A', '')]
    for (what, prefix) in all_test:
        result.append(what)
        for n in range(max_num):
            stdout, stderr = run_test(what, n, prefix)
            if stderr:
                result.append(str(n) + '\t\tfail: ' + stderr)
            else:
                result.append(str(n) + '\t\tok:' + stdout)
        result.append('\n')

    path = 'result-' + str(max_num) + '.log'
    with open(path, 'wb') as fd:
        fd.write('\n'.join(result).encode('utf-8'))

test_all(50)

result-50.log

zufuliu commented 1 year ago

Updated Python script to generate test for nested backquoted command substitution (above conclusions are not all correct).

import os
import subprocess

def generate_test(level, what, n, preamble=''):
    backslash = '\\'*((1 << level) + n)
    prefix = [preamble, 'echo ']
    suffix = []
    for i in range(level):
        count = (1 << i) - 1
        escape = backslash[:count]
        prefix.append(f'{escape}`echo ')
        suffix.append(f' {escape}`')
    prefix.append(f'{backslash[:n]}{what}')
    prefix.extend(reversed(suffix))
    return ''.join(prefix)

def run_test(level, what, n, preamble=''):
    code = generate_test(level, what, n, preamble)
    path = f'{ord(what[0])}-{level}-{n}.sh'
    with open(path, 'wb') as fd:
        fd.write(code.encode('utf-8'))

    proc = subprocess.run(['bash', path], check=False, capture_output=True)
    try:
        os.remove(path)
    except:
        pass
    return code, proc.stdout, proc.stderr

def merge_output(output):
    return '; '.join(output.decode('utf-8').strip().splitlines())

def test_all(level, max_num):
    result = []
    all_test = [('$bar', 'bar=A; '), ('$((1))', ''), ('A', ''), ('\'', ''), ('\"', ''), ('`', '')]
    for (what, preamble) in all_test:
        result.append(what)
        length = len(generate_test(level, what, max_num - 1, preamble)) + 1
        for n in range(max_num):
            code, stdout, stderr = run_test(level, what, n, preamble)
            suffix = ' '*(length - len(code))
            prefix = f'\t{n}\t{code}{suffix}\t'
            if stderr:
                result.append(f'{prefix}FAIL: {merge_output(stderr)}')
            else:
                result.append(f'{prefix}OK: {merge_output(stdout)}')
        result.append('\n')

    path = f'result-{level}-{max_num}.log'
    with open(path, 'wb') as fd:
        fd.write('\n'.join(result).encode('utf-8'))

#test_all(1, 16)
#test_all(2, 16)
#test_all(3, 16)
#test_all(5, 16)
nyamatongwe commented 1 year ago

To try to understand what the shell invocations are seeing, add -x to bash to enable tracing, although it interferes with failure reporting as the trace goes to stderr.

    proc = subprocess.run(['bash', '-x', path], check=False, capture_output=True)
zufuliu commented 1 year ago

OK. here is updated script and some running results:

import os
import multiprocessing
import subprocess

def generate_test(level, what, n, preamble=''):
    backslash = '\\'*((1 << level) + n)
    prefix = [preamble, 'echo ']
    suffix = []
    for i in range(level):
        count = (1 << i) - 1
        escape = backslash[:count]
        prefix.append(f'{escape}`echo ')
        suffix.append(f' {escape}`')
    prefix.append(what.format(backslash[:n]))
    prefix.extend(reversed(suffix))
    return ''.join(prefix)

def run_test(level, what, n, preamble=''):
    code = generate_test(level, what, n, preamble)
    path = f'x{level}-{n}.sh'
    with open(path, 'wb') as fd:
        fd.write(code.encode('utf-8'))
    proc = subprocess.run(['bash', '-x', path], check=False, capture_output=True)
    try:
        os.remove(path)
    except:
        pass
    return code, proc.stdout, proc.stderr

def run_test_ex(level, what, n, preamble=''):
    code, stdout, stderr = run_test(level, what, n, preamble)
    prefix = f'\t{n}\t{code}\t'
    status, suffix = 'OK', ' '
    if b'unexpected' in stderr:
        status, suffix = 'FAIL', '\n\t\t# '
    lines = (stdout + stderr).decode('utf-8').strip().splitlines()
    output = '\n\t\t# '.join(lines)
    return f'{prefix}# {status}:{suffix}{output}'

def test_all(pool, all_test, level, max_num):
    result = []
    for (what, preamble) in all_test:
        result.append('# ' + what)
        args = [(level, what, n, preamble) for n in range(max_num)]
        for output in pool.starmap(run_test_ex, args):
            result.append(output)
        result.append('\n')
    path = f'result-{level}-{max_num}.log'
    with open(path, 'wb') as fd:
        fd.write('\n'.join(result).encode('utf-8'))

def main():
    all_test = [('{}$foo', 'foo=A; '), ('{}$((1))', ''), ('{}A', ''), ('{}\"', ''), ('{}`', '')]
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        test_all(pool, all_test, 1, 16)
        test_all(pool, all_test, 2, 32)
        test_all(pool, all_test, 3, 64)
        test_all(pool, all_test, 4, 128)
        test_all(pool, all_test, 5, 256)

if __name__ == '__main__':
    main()

bash-backtick.zip

zufuliu commented 1 year ago

bash-backtick-0810.zip

Attachment is update Python script, running results and patch based on following observations:

level nesting backtick dollar quotes
1 $2n + 1$ $4n - 1$ $odd(N/2)$ $even((N - 1)/2)$
2 $2n + 3$ $8n - 1$ $odd(N/4)$ $even((N - 1)/4)$
3 $2n + 7$ $16n - 1$ $odd(N/8)$ $even((N - 1)/8)$
4 $2n + 15$ $32n - 1$ $odd(N/16)$ $even((N - 1)/16)$
5 $2n + 31$ $64n - 1$ $odd(N/32)$ $even((N - 1)/32)$
k $2n + 2^k - 1$ $n\times 2^{k + 1} - 1$ $odd(N/2^k)$ $even((N - 1)/2^k)$

for $k$ level substitution with $N$ backslashes:

zufuliu commented 1 year ago

Moved if (backtickLevel != 0 block from CountDown() into Pop() for consistency. bash-backtick-0810-2.patch

zufuliu commented 1 year ago

bash-backtick-0811.zip

Updated conditions for nesting, added property lexer.bash.nested.backticks as tcsh doesn't support nested backquoted command substitution. the patch works for all generated valid test cases (as well as all valid code at https://sourceforge.net/p/scintilla/bugs/2100/, generating new test cases for 5 levels nesting is killed after long running).

level dollar quotes backtick start end
1 odd $(N/2)$ even $(N - 1)/2$ $4n - 1$ $4n + 1$ $2m + 0$
2 odd $(N/4)$ even $(N - 1)/4$ $8n - 1$ $8n + 3$ $4m + 1$
3 odd $(N/8)$ even $(N - 1)/8$ $16n - 1$ $16n + 7$ $8m + 3$
4 odd $(N/16)$ even $(N - 1)/16$ $32n - 1$ $32n + 15$ $16m + 7$
5 odd $(N/32)$ even $(N - 1)/32$ $64n - 1$ $64n + 31$ $32m + 15$
k odd $(N/2^k)$ even $(N - 1)/2^k$ $n\times 2^{k + 1} - 1$ $n\times 2^{k + 1} + 2^k - 1$ $m\times 2^k + 2^{k - 1} - 1$

for $k$ level substitution with $N$ backslashes:

zufuliu commented 1 year ago

generated test cases for 5 levels nesting: backtick-5-256.zip

zufuliu commented 1 year ago

bash-backtick-0818.patch Patch based on current LexBash.cxx.

nyamatongwe commented 1 year ago

@zufuliu How do you feel about the High Integrity C++ Coding Standard's 'do not use bitwise operators with signed operands'?

Use of signed operands with bitwise operators is in some cases subject to undefined or implementation defined behavior. Therefore, bitwise operators should only be used with operands of unsigned integral types.

Section 5.6.1 of https://www.perforce.com/resources/qac/high-integrity-cpp-coding-standard-expressions https://releases.llvm.org/14.0.0/tools/clang/tools/extra/docs/clang-tidy/checks/hicpp-signed-bitwise.html

This new code shows quite a few of these as did implementing the commandSubstitutionFlag. While some of these could be avoided by using unsigned int and unsigned literals 1U, the root backtickLevel may go negative.

Qt Creator running clang-tidy: SignedBitwise

zufuliu commented 1 year ago

hicpp-* is not enabled (here is my configuration https://github.com/zufuliu/notepad2/blob/main/.clang-tidy), it's annoying for integer literal or macro (SCE_SH_*, SC_FOLD*) that is known to not touch sign bit. backtickLevel can't be negative as decrement inside Pop() method.

Patch fixed hicpp-signed-bitwise warnings for backtickLevel related code. bash-backtick-0819.patch

nyamatongwe commented 1 year ago

bash-backtick-0819.patch inverts the meaning of lexer.bash.nested.backticks compared with earlier patches with

+       if (backtickLevel > 0 && !nestedBackticks) {
zufuliu commented 1 year ago

Typo, should be if (backtickLevel > 0 && nestedBackticks) {. bash-backtick-0819-2.patch

nyamatongwe commented 1 year ago

it's annoying for integer literal or macro (SCESH, SC_FOLD) that is known to not touch sign bit.

Yes. Stricter unsigned attribute use is commonly too difficult because of code history and subtraction expressions. The potential for unexpected results when a value is negative mean that isolated paths that can be made stricter should be made stricter.

zufuliu commented 1 year ago

here needs extra code to fix $` tests (from bash-test-khman-20230806.7z), where dollar is not escaped but don't starts nested substitution.

echo `echo \$`
echo `echo \$bar\$`
diff --git a/lexers/LexBash.cxx b/lexers/LexBash.cxx
index cfc22aa6..ce461c61 100644
--- a/lexers/LexBash.cxx
+++ b/lexers/LexBash.cxx
@@ -388,7 +388,7 @@ public:
                // optimized to avoid track nested delimiter pairs
                style = QuoteStyle::Literal;
            }
-       } else if (sc.ch == '`') {  // $` seen in a configure script, valid?
+       } else if (sc.ch == '`' && backtickLevel == 0) {    // $` seen in a configure script, valid?
            style = QuoteStyle::Backtick;
            sc.ChangeState(SCE_SH_BACKTICKS);
        } else {

or comment out the if block:

diff --git a/lexers/LexBash.cxx b/lexers/LexBash.cxx
index cfc22aa6..5d44bd83 100644
--- a/lexers/LexBash.cxx
+++ b/lexers/LexBash.cxx
@@ -388,9 +388,9 @@ public:
                // optimized to avoid track nested delimiter pairs
                style = QuoteStyle::Literal;
            }
-       } else if (sc.ch == '`') {  // $` seen in a configure script, valid?
-           style = QuoteStyle::Backtick;
-           sc.ChangeState(SCE_SH_BACKTICKS);
+       //} else if (sc.ch == '`' && backtickLevel == 0) {  // $` seen in a configure script, valid?
+       //  style = QuoteStyle::Backtick;
+       //  sc.ChangeState(SCE_SH_BACKTICKS);
        } else {
            // scalar has no delimiter pair
            if (!setParamStart.Contains(sc.ch)) {

bash-backticks-0820.patch bash-backticks-0820-2.patch

zufuliu commented 1 year ago

same code without backslash fails in current lexer:

echo `echo $`
echo `echo $bar$`
nyamatongwe commented 1 year ago

or comment out the if block

These produce different output so its a question of how the $ should appear.

$`ls`
# Current or with first patch
{11}$`ls`{0}
# second patch changes to
{0}${11}`ls`{0}

Here is an example from another project: https://github.com/adobe-flash/crossbridge/blob/8edfb144bfe52bee6799ad14929e3e00cb4f2226/llvm-gcc-4.2-2.9/zlib/configure#L661

eval ac_val=$`echo $ac_var`

This appears to be using $ to follow a level of indirection so the better styling for $ may be SCE_SH_DEFAULT or SCE_SH_OPERATOR.

The '$` seen in a configure script' code appeared in Scintilla 3.2.3 with this change set: https://sourceforge.net/p/scintilla/code/ci/b93c27731f4b35b726de50c191a92bfb9b204740/ for feature: https://sourceforge.net/p/scintilla/feature-requests/954/

Configure scripts may contain '$`' where the backtick doesn't appear to start a command. An example

$value .= $`;

occurs at: https://github.com/freebsd/freebsd-src/blob/ed3fb74e44b9ef37837cebc5acfd7e68867206ca/crypto/openssl/Configure#L1984

nyamatongwe commented 1 year ago
$value .= $`;

That file is actually Perl - line 1 is #! /usr/bin/env perl, so its not evidence for a '$`' that doesn't start a command.

zufuliu commented 1 year ago

bash-dollar-backtick-0821.patch Patch to remove the if block:

diff --git a/lexers/LexBash.cxx b/lexers/LexBash.cxx
index cfc22aa6..2e6f86f2 100644
--- a/lexers/LexBash.cxx
+++ b/lexers/LexBash.cxx
@@ -388,9 +388,6 @@ public:
                // optimized to avoid track nested delimiter pairs
                style = QuoteStyle::Literal;
            }
-       } else if (sc.ch == '`') {  // $` seen in a configure script, valid?
-           style = QuoteStyle::Backtick;
-           sc.ChangeState(SCE_SH_BACKTICKS);
        } else {
            // scalar has no delimiter pair
            if (!setParamStart.Contains(sc.ch)) {

$`command` is just plain dollar concatenated with following command substitution result:

foo=1
ac_var=foo
echo $`echo $ac_var`    # output: $foo
eval ac_val=$`echo $ac_var`
echo $ac_val            # output: 1
eval val=$foo
echo $val               # output: 1
zufuliu commented 1 year ago

bash-dollar-backtick-0821-2.patch Removed corresponding comment for QuoteStyle::Backtick:

diff --git a/lexers/LexBash.cxx b/lexers/LexBash.cxx
index cfc22aa6..7f99d0af 100644
--- a/lexers/LexBash.cxx
+++ b/lexers/LexBash.cxx
@@ -81,7 +81,7 @@ enum class QuoteStyle {
    String,         // ""
    LString,        // $""
    HereDoc,        // here document
-   Backtick,       // ``, $``
+   Backtick,       // ``
    Parameter,      // ${}
    Command,        // $()
    CommandInside,  // $() with styling inside
@@ -388,9 +388,6 @@ public:
                // optimized to avoid track nested delimiter pairs
                style = QuoteStyle::Literal;
            }
-       } else if (sc.ch == '`') {  // $` seen in a configure script, valid?
-           style = QuoteStyle::Backtick;
-           sc.ChangeState(SCE_SH_BACKTICKS);
        } else {
            // scalar has no delimiter pair
            if (!setParamStart.Contains(sc.ch)) {