Closed zufuliu closed 1 year ago
Python script to run the tests:
import os
import subprocess
def run_test(what, n, prefix=''):
code = prefix + 'echo `foo=' + '\\'*n + what + '; echo $foo`\n'
path = str(ord(what[0])) + '-' + str(n) + '.sh'
with open(path, 'wb') as fd:
fd.write(code.encode('utf-8'))
proc = subprocess.run(['bash', path], capture_output=True)
try:
os.remove(path)
except:
pass
return proc.stdout.decode('utf-8').strip(), proc.stderr.decode('utf-8').strip()
def test_all(max_num):
result = []
all_test = [('\'', ''), ('\"', ''), ('`', ''), ('$bar', 'bar=A\n'), ('$((1 + 2))', ''), ('A', '')]
for (what, prefix) in all_test:
result.append(what)
for n in range(max_num):
stdout, stderr = run_test(what, n, prefix)
if stderr:
result.append(str(n) + '\t\tfail: ' + stderr)
else:
result.append(str(n) + '\t\tok:' + stdout)
result.append('\n')
path = 'result-' + str(max_num) + '.log'
with open(path, 'wb') as fd:
fd.write('\n'.join(result).encode('utf-8'))
test_all(50)
Updated Python script to generate test for nested backquoted command substitution (above conclusions are not all correct).
import os
import subprocess
def generate_test(level, what, n, preamble=''):
backslash = '\\'*((1 << level) + n)
prefix = [preamble, 'echo ']
suffix = []
for i in range(level):
count = (1 << i) - 1
escape = backslash[:count]
prefix.append(f'{escape}`echo ')
suffix.append(f' {escape}`')
prefix.append(f'{backslash[:n]}{what}')
prefix.extend(reversed(suffix))
return ''.join(prefix)
def run_test(level, what, n, preamble=''):
code = generate_test(level, what, n, preamble)
path = f'{ord(what[0])}-{level}-{n}.sh'
with open(path, 'wb') as fd:
fd.write(code.encode('utf-8'))
proc = subprocess.run(['bash', path], check=False, capture_output=True)
try:
os.remove(path)
except:
pass
return code, proc.stdout, proc.stderr
def merge_output(output):
return '; '.join(output.decode('utf-8').strip().splitlines())
def test_all(level, max_num):
result = []
all_test = [('$bar', 'bar=A; '), ('$((1))', ''), ('A', ''), ('\'', ''), ('\"', ''), ('`', '')]
for (what, preamble) in all_test:
result.append(what)
length = len(generate_test(level, what, max_num - 1, preamble)) + 1
for n in range(max_num):
code, stdout, stderr = run_test(level, what, n, preamble)
suffix = ' '*(length - len(code))
prefix = f'\t{n}\t{code}{suffix}\t'
if stderr:
result.append(f'{prefix}FAIL: {merge_output(stderr)}')
else:
result.append(f'{prefix}OK: {merge_output(stdout)}')
result.append('\n')
path = f'result-{level}-{max_num}.log'
with open(path, 'wb') as fd:
fd.write('\n'.join(result).encode('utf-8'))
#test_all(1, 16)
#test_all(2, 16)
#test_all(3, 16)
#test_all(5, 16)
To try to understand what the shell invocations are seeing, add -x
to bash
to enable tracing, although it interferes with failure reporting as the trace goes to stderr.
proc = subprocess.run(['bash', '-x', path], check=False, capture_output=True)
OK. here is updated script and some running results:
import os
import multiprocessing
import subprocess
def generate_test(level, what, n, preamble=''):
backslash = '\\'*((1 << level) + n)
prefix = [preamble, 'echo ']
suffix = []
for i in range(level):
count = (1 << i) - 1
escape = backslash[:count]
prefix.append(f'{escape}`echo ')
suffix.append(f' {escape}`')
prefix.append(what.format(backslash[:n]))
prefix.extend(reversed(suffix))
return ''.join(prefix)
def run_test(level, what, n, preamble=''):
code = generate_test(level, what, n, preamble)
path = f'x{level}-{n}.sh'
with open(path, 'wb') as fd:
fd.write(code.encode('utf-8'))
proc = subprocess.run(['bash', '-x', path], check=False, capture_output=True)
try:
os.remove(path)
except:
pass
return code, proc.stdout, proc.stderr
def run_test_ex(level, what, n, preamble=''):
code, stdout, stderr = run_test(level, what, n, preamble)
prefix = f'\t{n}\t{code}\t'
status, suffix = 'OK', ' '
if b'unexpected' in stderr:
status, suffix = 'FAIL', '\n\t\t# '
lines = (stdout + stderr).decode('utf-8').strip().splitlines()
output = '\n\t\t# '.join(lines)
return f'{prefix}# {status}:{suffix}{output}'
def test_all(pool, all_test, level, max_num):
result = []
for (what, preamble) in all_test:
result.append('# ' + what)
args = [(level, what, n, preamble) for n in range(max_num)]
for output in pool.starmap(run_test_ex, args):
result.append(output)
result.append('\n')
path = f'result-{level}-{max_num}.log'
with open(path, 'wb') as fd:
fd.write('\n'.join(result).encode('utf-8'))
def main():
all_test = [('{}$foo', 'foo=A; '), ('{}$((1))', ''), ('{}A', ''), ('{}\"', ''), ('{}`', '')]
with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
test_all(pool, all_test, 1, 16)
test_all(pool, all_test, 2, 32)
test_all(pool, all_test, 3, 64)
test_all(pool, all_test, 4, 128)
test_all(pool, all_test, 5, 256)
if __name__ == '__main__':
main()
Attachment is update Python script, running results and patch based on following observations:
level | nesting | backtick | dollar | quotes |
---|---|---|---|---|
1 | $2n + 1$ | $4n - 1$ | $odd(N/2)$ | $even((N - 1)/2)$ |
2 | $2n + 3$ | $8n - 1$ | $odd(N/4)$ | $even((N - 1)/4)$ |
3 | $2n + 7$ | $16n - 1$ | $odd(N/8)$ | $even((N - 1)/8)$ |
4 | $2n + 15$ | $32n - 1$ | $odd(N/16)$ | $even((N - 1)/16)$ |
5 | $2n + 31$ | $64n - 1$ | $odd(N/32)$ | $even((N - 1)/32)$ |
k | $2n + 2^k - 1$ | $n\times 2^{k + 1} - 1$ | $odd(N/2^k)$ | $even((N - 1)/2^k)$ |
for $k$ level substitution with $N$ backslashes:
((N >> k) & 1) != 0
), following dollar is escaped.(((N - 1) >> k) & 1) == 0
), following quote is escaped.((N + 1) & ((1 << (k + 1)) - 1)) == 0
), following backtick is escaped.(N & 1) != 0 && N >= (1 << k) - 1
), following backtick starts nested substitution, which ended by $2m + 2^k - 1$ backslashes + backtick ($m$ and $n$ may differ). the code handle nested substitution still has bugs due to the ambiguity.Moved if (backtickLevel != 0
block from CountDown()
into Pop()
for consistency.
bash-backtick-0810-2.patch
Updated conditions for nesting, added property lexer.bash.nested.backticks
as tcsh doesn't support nested backquoted command substitution. the patch works for all generated valid test cases (as well as all valid code at https://sourceforge.net/p/scintilla/bugs/2100/, generating new test cases for 5 levels nesting is killed after long running).
level | dollar | quotes | backtick | start | end |
---|---|---|---|---|---|
1 | odd $(N/2)$ | even $(N - 1)/2$ | $4n - 1$ | $4n + 1$ | $2m + 0$ |
2 | odd $(N/4)$ | even $(N - 1)/4$ | $8n - 1$ | $8n + 3$ | $4m + 1$ |
3 | odd $(N/8)$ | even $(N - 1)/8$ | $16n - 1$ | $16n + 7$ | $8m + 3$ |
4 | odd $(N/16)$ | even $(N - 1)/16$ | $32n - 1$ | $32n + 15$ | $16m + 7$ |
5 | odd $(N/32)$ | even $(N - 1)/32$ | $64n - 1$ | $64n + 31$ | $32m + 15$ |
k | odd $(N/2^k)$ | even $(N - 1)/2^k$ | $n\times 2^{k + 1} - 1$ | $n\times 2^{k + 1} + 2^k - 1$ | $m\times 2^k + 2^{k - 1} - 1$ |
for $k$ level substitution with $N$ backslashes:
((N >> k) & 1) != 0
), following dollar is escaped.(((N - 1) >> k) & 1) == 0
), following quote is escaped.((N + 1) & ((1 << (k + 1)) - 1)) == 0
), following backtick is escaped.(N + 1) >= (1 << k)
), following backtick starts inner substitution.generated test cases for 5 levels nesting: backtick-5-256.zip
bash-backtick-0818.patch Patch based on current LexBash.cxx.
@zufuliu How do you feel about the High Integrity C++ Coding Standard's 'do not use bitwise operators with signed operands'?
Use of signed operands with bitwise operators is in some cases subject to undefined or implementation defined behavior. Therefore, bitwise operators should only be used with operands of unsigned integral types.
Section 5.6.1 of https://www.perforce.com/resources/qac/high-integrity-cpp-coding-standard-expressions https://releases.llvm.org/14.0.0/tools/clang/tools/extra/docs/clang-tidy/checks/hicpp-signed-bitwise.html
This new code shows quite a few of these as did implementing the commandSubstitutionFlag
. While some of these could be avoided by using unsigned int
and unsigned literals 1U
, the root backtickLevel
may go negative.
Qt Creator running clang-tidy:
hicpp-*
is not enabled (here is my configuration https://github.com/zufuliu/notepad2/blob/main/.clang-tidy), it's annoying for integer literal or macro (SCE_SH_*
, SC_FOLD*
) that is known to not touch sign bit.
backtickLevel
can't be negative as decrement inside Pop()
method.
Patch fixed hicpp-signed-bitwise
warnings for backtickLevel
related code.
bash-backtick-0819.patch
bash-backtick-0819.patch inverts the meaning of lexer.bash.nested.backticks
compared with earlier patches with
+ if (backtickLevel > 0 && !nestedBackticks) {
Typo, should be if (backtickLevel > 0 && nestedBackticks) {
.
bash-backtick-0819-2.patch
it's annoying for integer literal or macro (SCESH, SC_FOLD) that is known to not touch sign bit.
Yes. Stricter unsigned attribute use is commonly too difficult because of code history and subtraction expressions. The potential for unexpected results when a value is negative mean that isolated paths that can be made stricter should be made stricter.
here needs extra code to fix $`
tests (from bash-test-khman-20230806.7z), where dollar is not escaped but don't starts nested substitution.
echo `echo \$`
echo `echo \$bar\$`
diff --git a/lexers/LexBash.cxx b/lexers/LexBash.cxx
index cfc22aa6..ce461c61 100644
--- a/lexers/LexBash.cxx
+++ b/lexers/LexBash.cxx
@@ -388,7 +388,7 @@ public:
// optimized to avoid track nested delimiter pairs
style = QuoteStyle::Literal;
}
- } else if (sc.ch == '`') { // $` seen in a configure script, valid?
+ } else if (sc.ch == '`' && backtickLevel == 0) { // $` seen in a configure script, valid?
style = QuoteStyle::Backtick;
sc.ChangeState(SCE_SH_BACKTICKS);
} else {
or comment out the if block:
diff --git a/lexers/LexBash.cxx b/lexers/LexBash.cxx
index cfc22aa6..5d44bd83 100644
--- a/lexers/LexBash.cxx
+++ b/lexers/LexBash.cxx
@@ -388,9 +388,9 @@ public:
// optimized to avoid track nested delimiter pairs
style = QuoteStyle::Literal;
}
- } else if (sc.ch == '`') { // $` seen in a configure script, valid?
- style = QuoteStyle::Backtick;
- sc.ChangeState(SCE_SH_BACKTICKS);
+ //} else if (sc.ch == '`' && backtickLevel == 0) { // $` seen in a configure script, valid?
+ // style = QuoteStyle::Backtick;
+ // sc.ChangeState(SCE_SH_BACKTICKS);
} else {
// scalar has no delimiter pair
if (!setParamStart.Contains(sc.ch)) {
same code without backslash fails in current lexer:
echo `echo $`
echo `echo $bar$`
or comment out the if block
These produce different output so its a question of how the $
should appear.
$`ls`
# Current or with first patch
{11}$`ls`{0}
# second patch changes to
{0}${11}`ls`{0}
Here is an example from another project: https://github.com/adobe-flash/crossbridge/blob/8edfb144bfe52bee6799ad14929e3e00cb4f2226/llvm-gcc-4.2-2.9/zlib/configure#L661
eval ac_val=$`echo $ac_var`
This appears to be using $
to follow a level of indirection so the better styling for $
may be SCE_SH_DEFAULT
or SCE_SH_OPERATOR
.
The '$` seen in a configure script' code appeared in Scintilla 3.2.3 with this change set: https://sourceforge.net/p/scintilla/code/ci/b93c27731f4b35b726de50c191a92bfb9b204740/ for feature: https://sourceforge.net/p/scintilla/feature-requests/954/
Configure scripts may contain '$`' where the backtick doesn't appear to start a command. An example
$value .= $`;
$value .= $`;
That file is actually Perl - line 1 is #! /usr/bin/env perl
, so its not evidence for a '$`' that doesn't start a command.
bash-dollar-backtick-0821.patch Patch to remove the if block:
diff --git a/lexers/LexBash.cxx b/lexers/LexBash.cxx
index cfc22aa6..2e6f86f2 100644
--- a/lexers/LexBash.cxx
+++ b/lexers/LexBash.cxx
@@ -388,9 +388,6 @@ public:
// optimized to avoid track nested delimiter pairs
style = QuoteStyle::Literal;
}
- } else if (sc.ch == '`') { // $` seen in a configure script, valid?
- style = QuoteStyle::Backtick;
- sc.ChangeState(SCE_SH_BACKTICKS);
} else {
// scalar has no delimiter pair
if (!setParamStart.Contains(sc.ch)) {
$`command`
is just plain dollar concatenated with following command substitution result:
foo=1
ac_var=foo
echo $`echo $ac_var` # output: $foo
eval ac_val=$`echo $ac_var`
echo $ac_val # output: 1
eval val=$foo
echo $val # output: 1
bash-dollar-backtick-0821-2.patch
Removed corresponding comment for QuoteStyle::Backtick
:
diff --git a/lexers/LexBash.cxx b/lexers/LexBash.cxx
index cfc22aa6..7f99d0af 100644
--- a/lexers/LexBash.cxx
+++ b/lexers/LexBash.cxx
@@ -81,7 +81,7 @@ enum class QuoteStyle {
String, // ""
LString, // $""
HereDoc, // here document
- Backtick, // ``, $``
+ Backtick, // ``
Parameter, // ${}
Command, // $()
CommandInside, // $() with styling inside
@@ -388,9 +388,6 @@ public:
// optimized to avoid track nested delimiter pairs
style = QuoteStyle::Literal;
}
- } else if (sc.ch == '`') { // $` seen in a configure script, valid?
- style = QuoteStyle::Backtick;
- sc.ChangeState(SCE_SH_BACKTICKS);
} else {
// scalar has no delimiter pair
if (!setParamStart.Contains(sc.ch)) {
Created for https://sourceforge.net/p/scintilla/bugs/2100/, not sure whether worth the complex though. https://www.gnu.org/software/bash/manual/bash.html#Command-Substitution
only backslash followed by
$
,"
,'
and`
need to be special handled, following is what observed from msys2 bash 5.2.15: more: ask more input after>
prompt. fail: unexpected EOF while looking for matching quote.4N + 2
or4N + 3
backslashes escapes dollar (bar=A
)echo `foo=$bar; echo $foo`
|A
echo `foo=\$bar; echo $foo`
|A
echo `foo=\\$bar; echo $foo`
|$bar
echo `foo=\\\$bar; echo $foo`
|$bar
echo `foo=\\\\$bar; echo $foo`
|\A
echo `foo=\\\\\$bar; echo $foo`
|\A
echo `foo=\\\\\\$bar; echo $foo`
|\$bar
echo `foo=\\\\\\\$bar; echo $foo`
|\$bar
echo `foo=\\\\\\\\$bar; echo $foo`
|\\A
echo `foo=\\\\\\\\\$bar; echo $foo`
|\\A
echo `foo=\\\\\\\\\\$bar; echo $foo`
|\\$bar
echo `foo=\\\\\\\\\\\$bar; echo $foo`
|\\$bar
4N + 1
or4N + 2
backslashes escapes double and single quoteecho `foo="; echo $foo`
echo `foo=\"; echo $foo`
| ok,"
echo `foo=\\"; echo $foo`
| ok,"
echo `foo=\\\"; echo $foo`
echo `foo=\\\\"; echo $foo`
echo `foo=\\\\\"; echo $foo`
| ok,\"
echo `foo=\\\\\\"; echo $foo`
| ok,\"
echo `foo=\\\\\\\"; echo $foo`
echo `foo=\\\\\\\\"; echo $foo`
echo `foo=\\\\\\\\\"; echo $foo`
| ok,\\"
echo `foo=\\\\\\\\\\"; echo $foo`
| ok,\\"
echo `foo=\\\\\\\\\\\"; echo $foo`
4N + 3
backslashes escapes backquoteecho `foo=`; echo $foo`
echo `foo=\`; echo $foo`
echo `foo=\\`; echo $foo`
echo `foo=\\\`; echo $foo`
| ok,`
echo `foo=\\\\`; echo $foo`
echo `foo=\\\\\`; echo $foo`
echo `foo=\\\\\\`; echo $foo`
echo `foo=\\\\\\\`; echo $foo`
| ok,\`
echo `foo=\\\\\\\\`; echo $foo`
echo `foo=\\\\\\\\\`; echo $foo`
echo `foo=\\\\\\\\\\`; echo $foo`
echo `foo=\\\\\\\\\\\`; echo $foo`
| ok,\\`