Open djklim87 opened 4 months ago
If you modify lexer like this, it will work 20 times faster double(1.3940100669861)
<?php
/**
* PHPSQLLexer.php
*
* This file contains the lexer, which splits and recombines parts of the
* SQL statement just before parsing.
*
* PHP version 5
*
* LICENSE:
* Copyright (c) 2010-2014 Justin Swanhart and André Rothe
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* @author André Rothe <andre.rothe@phosco.info>
* @copyright 2010-2014 Justin Swanhart and André Rothe
* @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
* @version SVN: $Id$
*
*/
namespace PHPSQLParser\lexer;
use PHPSQLParser\exceptions\InvalidParameterException;
/**
* This class splits the SQL string into little parts, which the parser can
* use to build the result array.
*
* @author André Rothe <andre.rothe@phosco.info>
* @license http://www.debian.org/misc/bsd.license BSD License (3 Clause)
*
*/
class PHPSQLLexer {
protected $splitters;
/**
* Constructor.
*
* It initializes some fields.
*/
public function __construct() {
$this->splitters = new LexerSplitter();
}
/**
* Ends the given string $haystack with the string $needle?
*
* @param string $haystack
* @param string $needle
*
* @return boolean true, if the parameter $haystack ends with the character sequences $needle, false otherwise
*/
protected function endsWith($haystack, $needle) {
$length = strlen($needle);
if ($length == 0) {
return true;
}
return (substr($haystack, -$length) === $needle);
}
public function split($sql) {
if (!is_string($sql)) {
throw new InvalidParameterException($sql);
}
$tokens = preg_split($this->splitters->getSplittersRegexPattern(), $sql, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
$tokens = $this->concatComments($tokens);
$tokens = $this->concatEscapeSequences($tokens);
$tokens = $this->balanceBackticks($tokens);
$tokens = $this->concatColReferences($tokens);
$tokens = $this->balanceParenthesis($tokens);
$tokens = $this->concatUserDefinedVariables($tokens);
$tokens = $this->concatScientificNotations($tokens);
$tokens = $this->concatNegativeNumbers($tokens);
return $tokens;
}
protected function concatNegativeNumbers($tokens) {
$i = 0;
$cnt = count($tokens);
$possibleSign = true;
while ($i < $cnt) {
if (!isset($tokens[$i])) {
$i++;
continue;
}
$token = $tokens[$i];
// a sign is also possible on the first position of the tokenlist
if ($possibleSign === true) {
if ($token === '-' || $token === '+') {
if (is_numeric($tokens[$i + 1])) {
$tokens[$i + 1] = $token . $tokens[$i + 1];
unset($tokens[$i]);
}
}
$possibleSign = false;
continue;
}
// TODO: we can have sign of a number after "(" and ",", are others possible?
if (substr($token, -1, 1) === "," || substr($token, -1, 1) === "(") {
$possibleSign = true;
}
$i++;
}
return array_values($tokens);
}
protected function concatScientificNotations($tokens) {
$i = 0;
$cnt = count($tokens);
$scientific = false;
while ($i < $cnt) {
if (!isset($tokens[$i])) {
$i++;
continue;
}
$token = $tokens[$i];
if ($scientific === true) {
if ($token === '-' || $token === '+') {
$tokens[$i - 1] .= $tokens[$i];
$tokens[$i - 1] .= $tokens[$i + 1];
unset($tokens[$i]);
unset($tokens[$i + 1]);
} elseif (is_numeric($token)) {
$tokens[$i - 1] .= $tokens[$i];
unset($tokens[$i]);
}
$scientific = false;
continue;
}
if (strtoupper(substr($token, -1, 1)) === 'E') {
$scientific = is_numeric(substr($token, 0, -1));
}
$i++;
}
return array_values($tokens);
}
protected function concatUserDefinedVariables($tokens) {
$i = 0;
$cnt = count($tokens);
$userdef = false;
while ($i < $cnt) {
if (!isset($tokens[$i])) {
$i++;
continue;
}
$token = $tokens[$i];
if ($userdef !== false) {
$tokens[$userdef] .= $token;
unset($tokens[$i]);
if ($token !== "@") {
$userdef = false;
}
}
if ($userdef === false && $token === "@") {
$userdef = $i;
}
$i++;
}
return array_values($tokens);
}
protected function concatComments($tokens) {
$i = 0;
$cnt = count($tokens);
$comment = false;
$backTicks = [];
$in_string = false;
$inline = false;
while ($i < $cnt) {
if (!isset($tokens[$i])) {
$i++;
continue;
}
$token = $tokens[$i];
/*
* Check to see if we're inside a value (i.e. back ticks).
* If so inline comments are not valid.
*/
if ($comment === false && $this->isBacktick($token)) {
if (!empty($backTicks)) {
$lastBacktick = array_pop($backTicks);
if ($lastBacktick != $token) {
$backTicks[] = $lastBacktick; // Re-add last back tick
$backTicks[] = $token;
}
} else {
$backTicks[] = $token;
}
}
if($comment === false && ($token == "\"" || $token == "'")) {
$in_string = !$in_string;
}
if(!$in_string) {
if ($comment !== false) {
if ($inline === true && ($token === "\n" || $token === "\r\n")) {
$comment = false;
} else {
unset($tokens[$i]);
$tokens[$comment] .= $token;
}
if ($inline === false && ($token === "*/")) {
$comment = false;
}
}
if (($comment === false) && ($token === "--") && empty($backTicks)) {
$comment = $i;
$inline = true;
}
if (($comment === false) && (substr($token, 0, 1) === "#") && empty($backTicks)) {
$comment = $i;
$inline = true;
}
if (($comment === false) && ($token === "/*")) {
$comment = $i;
$inline = false;
}
}
$i++;
}
return array_values($tokens);
}
protected function isBacktick($token) {
return ($token === "'" || $token === "\"" || $token === "`");
}
protected function balanceBackticks($tokens) {
$unsetCount = 0;
$fullLength = sizeof($tokens);
foreach ($tokens as $k=> $token){
if ($unsetCount>0){
unset($tokens[$k]);
$unsetCount--;
continue;
}
if ($this->isBacktick($token)) {
list($token, $unsetCount) = $this->balanceCharacter($tokens, $k, $token, $fullLength);
$tokens[$k]=$token;
}
}
return array_values($tokens);
}
// backticks are not balanced within one token, so we have
// to re-combine some tokens
protected function balanceCharacter($tokens, $startPosition, $char, $fullLength) {
$shift = 0;
$startPosition ++;
$between[] = $char;
for ($i = $startPosition; $i < $fullLength; $i++) {
$token = $tokens[$i];
$between[] = $token;
$shift++;
if ($token === $char) {
break;
}
}
$result = implode("", $between);
return [$result, $shift];
}
/**
* This function concats some tokens to a column reference.
* There are two different cases:
*
* 1. If the current token ends with a dot, we will add the next token
* 2. If the next token starts with a dot, we will add it to the previous token
*
*/
protected function concatColReferences($tokens) {
$cnt = count($tokens);
$i = 0;
while ($i < $cnt) {
if (!isset($tokens[$i])) {
$i++;
continue;
}
if ($tokens[$i][0] === ".") {
// concat the previous tokens, till the token has been changed
$k = $i - 1;
$len = strlen($tokens[$i]);
while (($k >= 0) && ($len == strlen($tokens[$i]))) {
if (!isset($tokens[$k])) { // FIXME: this can be wrong if we have schema . table . column
$k--;
continue;
}
$tokens[$i] = $tokens[$k] . $tokens[$i];
unset($tokens[$k]);
$k--;
}
}
if ($this->endsWith($tokens[$i], '.') && !is_numeric($tokens[$i])) {
// concat the next tokens, till the token has been changed
$k = $i + 1;
$len = strlen($tokens[$i]);
while (($k < $cnt) && ($len == strlen($tokens[$i]))) {
if (!isset($tokens[$k])) {
$k++;
continue;
}
$tokens[$i] .= $tokens[$k];
unset($tokens[$k]);
$k++;
}
}
$i++;
}
return array_values($tokens);
}
protected function concatEscapeSequences($tokens) {
$tokenCount = count($tokens);
$i = 0;
while ($i < $tokenCount) {
if ($this->endsWith($tokens[$i], "\\")) {
$i++;
if (isset($tokens[$i])) {
$tokens[$i - 1] .= $tokens[$i];
unset($tokens[$i]);
}
}
$i++;
}
return array_values($tokens);
}
protected function balanceParenthesis($tokens) {
$token_count = count($tokens);
$i = 0;
while ($i < $token_count) {
if ($tokens[$i] !== '(') {
$i++;
continue;
}
$count = 1;
for ($n = $i + 1; $n < $token_count; $n++) {
$token = $tokens[$n];
if ($token === '(') {
$count++;
}
if ($token === ')') {
$count--;
}
$tokens[$i] .= $token;
unset($tokens[$n]);
if ($count === 0) {
$n++;
break;
}
}
$i = $n;
}
return array_values($tokens);
}
}
?>
Interesting. @djklim87, Did you run the unit tests after the change?
This is the diff
diff --git a/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php b/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
index 8faf5574c..b27bcd15a 100644
--- a/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
+++ b/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
@@ -283,51 +283,43 @@ class PHPSQLLexer {
}
protected function balanceBackticks($tokens) {
- $i = 0;
- $cnt = count($tokens);
- while ($i < $cnt) {
+ $unsetCount = 0;
+ $fullLength = sizeof($tokens);
+ foreach ($tokens as $k=> $token){
- if (!isset($tokens[$i])) {
- $i++;
+ if ($unsetCount>0){
+ unset($tokens[$k]);
+ $unsetCount--;
continue;
}
-
- $token = $tokens[$i];
-
if ($this->isBacktick($token)) {
- $tokens = $this->balanceCharacter($tokens, $i, $token);
+ list($token, $unsetCount) = $this->balanceCharacter($tokens, $k, $token, $fullLength);
+ $tokens[$k]=$token;
}
-
- $i++;
}
- return $tokens;
+ return array_values($tokens);
}
// backticks are not balanced within one token, so we have
// to re-combine some tokens
- protected function balanceCharacter($tokens, $idx, $char) {
+ protected function balanceCharacter($tokens, $startPosition, $char, $fullLength) {
- $token_count = count($tokens);
- $i = $idx + 1;
- while ($i < $token_count) {
-
- if (!isset($tokens[$i])) {
- $i++;
- continue;
- }
+ $shift = 0;
+ $startPosition ++;
+ $between[] = $char;
+ for ($i = $startPosition; $i < $fullLength; $i++) {
$token = $tokens[$i];
- $tokens[$idx] .= $token;
- unset($tokens[$i]);
-
+ $between[] = $token;
+ $shift++;
if ($token === $char) {
break;
}
-
- $i++;
}
- return array_values($tokens);
+ $result = implode("", $between);
+
+ return [$result, $shift];
}
/**
would be nice if someone could validate that. I will try to find some time to test it and share.
Thanks HTH
unit tests are passing
I created a phpbench script
<?php
include 'vendor/autoload.php';
use PHPSQLParser\PHPSQLParser;
class ParseInsertBench {
/**
* @Revs(20)
* @Iterations(5)
*/
public function benchparseInsert() {
$parser = new PHPSQLParser();
$query = file_get_contents('insert.sql');
$parser->parse($query);
}
/**
* @Revs(20)
* @Iterations(5)
*/
public function benchparseQuery() {
$parser = new PHPSQLParser();
$query = file_get_contents('query.sql');
$parser->parse($query);
}
}
BEFORE
./phpbench.phar run testspeed.php
PHPBench (1.2.15) running benchmarks... #standwithukraine
with PHP version 8.2.15, xdebug ✔, opcache ❌
\ParseInsertBench
benchparseInsert........................I4 - Mo2.583m (±0.51%)
benchparseQuery.........................I4 - Mo9.289ms (±1.21%)
Subjects: 2, Assertions: 0, Failures: 0, Errors: 0
AFTER
./phpbench.phar run testspeed.php
PHPBench (1.2.15) running benchmarks... #standwithukraine
with PHP version 8.2.15, xdebug ✔, opcache ❌
\ParseInsertBench
benchparseInsert........................I4 - Mo1.966s (±0.71%)
benchparseQuery.........................I4 - Mo9.634ms (±4.71%)
Subjects: 2, Assertions: 0, Failures: 0, Errors: 0
The difference is significant on the insert SQL but seems worse on the query SQL.
HTH
@joebordes can you attach to this issue content of query.sql
?
select DISTINCT vtiger_invoice.subject AS 'Invoice_Subject', (CASE WHEN vtiger_invoice.salesorderid NOT LIKE '' THEN (CASE WHEN vtiger_salesorderInvoice.subject NOT LIKE '' THEN vtiger_salesorderInvoice.subject ELSE '' END) ELSE '' END) AS Invoice_Sales_Order, vtiger_invoice.customerno AS 'Invoice_Customer_No', vtiger_invoice.exciseduty AS 'Invoice_Excise_Duty', vtiger_invoice.salescommission AS 'Invoice_Sales_Commission', (CASE WHEN vtiger_accountInvoice.accountname NOT LIKE '' THEN (CASE WHEN vtiger_accountInvoice.accountname NOT LIKE '' THEN vtiger_accountInvoice.accountname ELSE '' END) ELSE '' END) AS Invoice_Account_Name, vtiger_invoicebillads.bill_street AS 'Invoice_Billing_Address', vtiger_invoiceshipads.ship_street AS 'Invoice_Shipping_Address', (CASE WHEN vtiger_assets.product NOT LIKE '' THEN (CASE WHEN vtiger_productsRelAssets603.productname NOT LIKE '' THEN vtiger_productsRelAssets603.productname ELSE '' END) ELSE '' END) AS Assets_Product_Name, vtiger_assets.serialnumber AS 'Assets_Serial_Number', vtiger_contactsubdetails.assistant AS 'Contacts_Assistant', (CASE WHEN vtiger_contactdetails.reportsto NOT LIKE '' THEN (CASE WHEN CONCAT(vtiger_contactdetailsContacts.firstname,' ',vtiger_contactdetailsContacts.lastname) NOT LIKE '' THEN CONCAT(vtiger_contactdetailsContacts.firstname,' ',vtiger_contactdetailsContacts.lastname) ELSE '' END) ELSE '' END) AS Contacts_Reports_To, vtiger_contactsubdetails.assistantphone AS 'Contacts_Assistant_Phone', vtiger_crmentity.crmid AS "LBL_ACTION" from vtiger_invoice inner join vtiger_crmentity on vtiger_crmentity.crmid=vtiger_invoice.invoiceid left join vtiger_groups on vtiger_groups.groupid = vtiger_crmentity.smownerid left join vtiger_users on vtiger_users.id = vtiger_crmentity.smownerid inner join vtiger_invoicebillads on vtiger_invoice.invoiceid=vtiger_invoicebillads.invoicebilladdressid inner join vtiger_invoiceshipads on vtiger_invoice.invoiceid=vtiger_invoiceshipads.invoiceshipaddressid left join vtiger_salesorder as vtiger_salesorderInvoice on vtiger_salesorderInvoice.salesorderid=vtiger_invoice.salesorderid left join vtiger_account as vtiger_accountInvoice on vtiger_accountInvoice.accountid = vtiger_invoice.accountid left join vtiger_reptmptbl_16632c2b68258b2627337190 as vtiger_assets on vtiger_assets.invoiceid=vtiger_invoice.invoiceid left join vtiger_crmentity as vtiger_crmentityAssets on vtiger_crmentityAssets.crmid = vtiger_assets.assetsid AND vtiger_crmentityAssets.deleted=0 left join vtiger_crmentity as vtiger_crmentityRelAssets0 on vtiger_crmentityRelAssets0.crmid = vtiger_assets.product and vtiger_crmentityRelAssets0.deleted=0 left join vtiger_products as vtiger_productsRelAssets603 on vtiger_productsRelAssets603.productid = vtiger_crmentityRelAssets0.crmid left join vtiger_reptmptbl_16632c2b6839a95870061631 as vtiger_contactdetails on vtiger_invoice.contactid=vtiger_contactdetails.contactid left join vtiger_crmentity as vtiger_crmentityContacts on vtiger_crmentityContacts.crmid = vtiger_contactdetails.contactid AND vtiger_crmentityContacts.deleted=0 left join vtiger_contactdetails as vtiger_contactdetailsContacts on vtiger_contactdetailsContacts.contactid = vtiger_contactdetails.reportsto left join vtiger_contactsubdetails on vtiger_contactdetails.contactid = vtiger_contactsubdetails.contactsubscriptionid WHERE vtiger_invoice.invoiceid>0 AND vtiger_crmentity.deleted=0 limit 0, 40
Result will
It's too slow for so a insert.sql.zip small file.