Found at work while using BEX to parse our JSP files. Below is the fix I found. I'll also add some unit tests.
/**
 * Scans JSP text once, left to right, and builds a map from text ranges to the
 * {@link ParsingState} that applies to each range.
 *
 * <p>The scanner keeps a stack of open states and recognizes:
 * <ul>
 * <li>JSP comments ({@code <%--} ... {@code --%>}) and HTML comments
 * ({@code <!--} ... {@code -->})</li>
 * <li>JSP expression, declaration, and scriptlet blocks ({@code <%=}, {@code <%!},
 * {@code <%}), all recorded as {@code IN_EXPRESSION_BLOCK} and ended by {@code %>}</li>
 * <li>Java line and block comments, only while inside a JSP Java block</li>
 * <li>double-quoted and single-quoted literals, only while inside an HTML tag or a
 * JSP Java block</li>
 * <li>HTML tags ({@code <} ... {@code >}) outside of Java blocks</li>
 * </ul>
 *
 * <p>A {@code <%=} expression found inside a quoted literal of an HTML tag is pushed as
 * a nested state; both quote styles are handled the same way.
 *
 * <p>Reference: https://www.tutorialspoint.com/jsp/jsp_syntax.htm
 *
 * @param text the JSP text to scan
 * @return the range map produced by the builder after the scan
 */
public static ImmutableIntRangeMap<ParsingState> parseJSPTextStates(final CharSequence text) {
    // TODO: this started from the Java parser and still needs to be enhanced for JSP
    // TODO: need a RangeMap class which correctly handles nested ranges
    //   Currently, "stuff <%= expression%> more stuff" doesn't work as expected:
    //   "more stuff" after the expression should be seen as part of the String literal,
    //   but isn't, since the lookup gets the last range (the expression), which is over.
    //   Possible fix: end the outer state when entering an inner state and, when leaving
    //   the inner state, start a new state based on the outer state.
    //   When adding a new record, an overlap occurs if and only if
    //     a) the added range's start is part of an existing range
    //        (find the existing range in the map and see if the added start is in the
    //        middle; see BEXUtilities.getEntryInRanges), or
    //     b) an existing range's start is contained in the new range
    //        (subRange check on the existing NavigableMap; any entries means overlap).
    //   If there's an overlap, handle it by breaking the ranges apart into pieces.
    ImmutableIntRangeMap.Builder<ParsingState> builder = ImmutableIntRangeMap.builder();
    ArrayDeque<ParsingState> stateStack = new ArrayDeque<>();
    ArrayDeque<Integer> startTextInfoStack = new ArrayDeque<>();
    ArrayDeque<Integer> parentStartStack = new ArrayDeque<>();

    // True while inside a JSP Java block (expression, declaration, or scriptlet)
    boolean isJava = false;
    // True while inside an HTML tag
    boolean isTag = false;

    // TODO: consider refactoring to track an expected end delimiter (String expectedEnd)
    // instead of the flags above; unclear how it would be used

    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        ParsingState currentState = unwrapParsingState(stateStack.peek());

        if (currentState == IN_STRING_LITERAL) {
            if (c == '\\') {
                // Escape next character; stop scanning if the backslash is the last character
                if (nextChar(text, i) == '\0') {
                    break;
                }
                i++;
            } else if (c == '"') {
                popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
            } else if (isTag && hasText(text, i, "<%=")) {
                // JSP expression embedded in a double-quoted tag attribute
                pushNextLevelParsingState(IN_EXPRESSION_BLOCK, i, builder, stateStack, startTextInfoStack,
                        parentStartStack);
                i += 2;
                isJava = true;
            }
            // Other characters don't matter??
            // TODO: handle unicode and other escaping in String literal
        } else if (currentState == IN_SECONDARY_STRING_LITERAL) {
            if (c == '\\') {
                // Escape next character; stop scanning if the backslash is the last character
                if (nextChar(text, i) == '\0') {
                    break;
                }
                i++;
            } else if (c == '\'') {
                popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
            } else if (isTag && hasText(text, i, "<%=")) {
                // JSP expression embedded in a single-quoted tag attribute.
                // Mirrors the double-quoted branch: only applies inside a tag, and Java mode
                // must be switched on so that the closing "%>" is recognized below instead of
                // its '>' being taken as the end of the surrounding HTML tag.
                pushNextLevelParsingState(IN_EXPRESSION_BLOCK, i, builder, stateStack, startTextInfoStack,
                        parentStartStack);
                i += 2;
                isJava = true;
            }
            // Other characters don't matter??
            // TODO: handle unicode and other escaping in String literal
            // TODO: Java comments only valid in <% code block %>
        } else if (isJava && hasText(text, i, "%>")) {
            // End of the JSP Java block; checked before the Java comment states, so "%>"
            // also ends an open Java comment
            isJava = false;
            if (currentState != IN_EXPRESSION_BLOCK) {
                // End the current state on the prior character
                popParsingState(i - 1, builder, stateStack, startTextInfoStack, parentStartStack);
            }
            i++;
            popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
        } else if (isJava && currentState == IN_LINE_COMMENT) {
            if (c == '\n' || c == '\r') {
                // The line comment ends on the character before the line terminator
                popParsingState(i - 1, builder, stateStack, startTextInfoStack, parentStartStack);
                i = handleLineTerminator(i, c, text, builder, stateStack, startTextInfoStack, parentStartStack);
            }
            // Other characters don't matter?
        } else if (isJava && currentState == IN_MULTILINE_COMMENT) {
            // Java block comment inside a JSP Java block
            if (hasText(text, i, "*/")) {
                i++;
                popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
            }
        } else if (currentState == IN_MULTILINE_COMMENT) {
            // JSP comment
            if (hasText(text, i, "--%>")) {
                i += 3;
                popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
            }
        } else if (currentState == IN_SECONDARY_MULTILINE_COMMENT) {
            // HTML comment
            if (hasText(text, i, "-->")) {
                i += 2;
                popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
            }
        } else if (isJava && c == '/' && nextChar(text, i) == '/') {
            pushNextLevelParsingState(IN_LINE_COMMENT, i, builder, stateStack, startTextInfoStack,
                    parentStartStack);
            i++;
        } else if (isJava && c == '/' && nextChar(text, i) == '*') {
            pushNextLevelParsingState(IN_MULTILINE_COMMENT, i, builder, stateStack, startTextInfoStack,
                    parentStartStack);
            i++;
        } else if (c == '"' && isTag) {
            pushNextLevelParsingState(IN_STRING_LITERAL, i, builder, stateStack, startTextInfoStack,
                    parentStartStack);
        } else if (c == '\'' && isTag) {
            pushNextLevelParsingState(IN_SECONDARY_STRING_LITERAL, i, builder, stateStack,
                    startTextInfoStack, parentStartStack);
        } else if (c == '"' && isJava) {
            pushParsingState(IN_STRING_LITERAL, i, stateStack, startTextInfoStack, parentStartStack);
        } else if (c == '\'' && isJava) {
            pushParsingState(IN_SECONDARY_STRING_LITERAL, i, stateStack, startTextInfoStack, parentStartStack);
        } else if (hasText(text, i, "<%--")) {
            // JSP comment; must be checked before the shorter "<%" openers
            pushParsingState(IN_MULTILINE_COMMENT, i, stateStack, startTextInfoStack, parentStartStack);
            i += 3;
        } else if (hasText(text, i, "<!--")) {
            // HTML comment
            pushParsingState(IN_SECONDARY_MULTILINE_COMMENT, i, stateStack, startTextInfoStack, parentStartStack);
            i += 3;
        } else if (hasText(text, i, "<%=")) {
            // In Java expression
            pushParsingState(IN_EXPRESSION_BLOCK, i, stateStack, startTextInfoStack, parentStartStack);
            i += 2;
            isJava = true;
        } else if (hasText(text, i, "<%!")) {
            // In Java declaration
            pushParsingState(IN_EXPRESSION_BLOCK, i, stateStack, startTextInfoStack, parentStartStack);
            i += 2;
            isJava = true;
        } else if (hasText(text, i, "<%")) {
            // In Java scriptlet
            pushParsingState(IN_EXPRESSION_BLOCK, i, stateStack, startTextInfoStack, parentStartStack);
            i++;
            isJava = true;
        } else if (c == '<' && !isJava && !isTag) {
            // Start of an HTML tag
            pushParsingState(IN_TAG, i, stateStack, startTextInfoStack, parentStartStack);
            isTag = true;
        } else if (c == '>' && isTag && !isJava) {
            // End of an HTML tag
            isTag = false;
            popParsingState(i, builder, stateStack, startTextInfoStack, parentStartStack);
        } else if (Character.isWhitespace(c)) {
            i = handleWhitespace(i, c, text, builder, stateStack, startTextInfoStack, parentStartStack);
        }
    }

    // The text ended while a state was still open; record the innermost open state
    // through the end of the text
    if (!stateStack.isEmpty()) {
        // TODO: what if there are multiple entries?
        // (this would suggest improperly formatted code)
        int startTextInfo = startTextInfoStack.pop();
        // TODO: does there need to be a parent?
        if (startTextInfo != text.length()) {
            builder.put(IntBEXRange.of(startTextInfo, text.length()), stateStack.pop());
        }
    }

    return builder.build();
}
Found at work while using BEX to parse our JSP files. Below is the fix I found. I'll also add some unit tests.