greenlion / PHP-SQL-Parser

A pure PHP SQL (non validating) parser w/ focus on MySQL dialect of SQL
BSD 3-Clause "New" or "Revised" License
601 stars 156 forks source link

Incredible slow parsing #385

Open djklim87 opened 4 months ago

djklim87 commented 4 months ago
<?php
include('vendor/autoload.php');
use PHPSQLParser\PHPSQLParser;
$parser = new PHPSQLParser();
$query = file_get_contents('/tmp/aaa/insert.sql');
$t = microtime(true);
$parsed = $parser->parse($query);
var_dump(microtime(true) - $t);
{
    ....
    "require": {
        "greenlion/php-sql-parser": "v4.6.0"
    }
}

Result will

double(58.586335897446)

It's too slow for so a insert.sql.zip small file.

djklim87 commented 4 months ago

If you modify lexer like this, it will work 20 times faster double(1.3940100669861)

<?php
/**
 * PHPSQLLexer.php
 *
 * This file contains the lexer, which splits and recombines parts of the
 * SQL statement just before parsing.
 *
 * PHP version 5
 *
 * LICENSE:
 * Copyright (c) 2010-2014 Justin Swanhart and André Rothe
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * @author    André Rothe <andre.rothe@phosco.info>
 * @copyright 2010-2014 Justin Swanhart and André Rothe
 * @license   http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
 * @version   SVN: $Id$
 *
 */

namespace PHPSQLParser\lexer;
use PHPSQLParser\exceptions\InvalidParameterException;

/**
 * This class splits the SQL string into little parts, which the parser can
 * use to build the result array.
 *
 * @author  André Rothe <andre.rothe@phosco.info>
 * @license http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
 *
 */
class PHPSQLLexer {

    protected $splitters;

    /**
     * Constructor.
     *
     * It initializes some fields.
     */
    public function __construct() {
        $this->splitters = new LexerSplitter();
    }

    /**
     * Ends the given string $haystack with the string $needle?
     *
     * @param string $haystack
     * @param string $needle
     *
     * @return boolean true, if the parameter $haystack ends with the character sequences $needle, false otherwise
     */
    protected function endsWith($haystack, $needle) {
        $length = strlen($needle);
        if ($length == 0) {
            return true;
        }
        return (substr($haystack, -$length) === $needle);
    }

    public function split($sql) {
        if (!is_string($sql)) {
            throw new InvalidParameterException($sql);
        }
        $tokens = preg_split($this->splitters->getSplittersRegexPattern(), $sql, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY);
        $tokens = $this->concatComments($tokens);
        $tokens = $this->concatEscapeSequences($tokens);
        $tokens = $this->balanceBackticks($tokens);
        $tokens = $this->concatColReferences($tokens);
        $tokens = $this->balanceParenthesis($tokens);
        $tokens = $this->concatUserDefinedVariables($tokens);
        $tokens = $this->concatScientificNotations($tokens);
        $tokens = $this->concatNegativeNumbers($tokens);
        return $tokens;
    }

    protected function concatNegativeNumbers($tokens) {

        $i = 0;
        $cnt = count($tokens);
        $possibleSign = true;

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            // a sign is also possible on the first position of the tokenlist
            if ($possibleSign === true) {
                if ($token === '-' || $token === '+') {
                    if (is_numeric($tokens[$i + 1])) {
                        $tokens[$i + 1] = $token . $tokens[$i + 1];
                        unset($tokens[$i]);
                    }
                }
                $possibleSign = false;
                continue;
            }

            // TODO: we can have sign of a number after "(" and ",", are others possible?
            if (substr($token, -1, 1) === "," || substr($token, -1, 1) === "(") {
                $possibleSign = true;
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function concatScientificNotations($tokens) {

        $i = 0;
        $cnt = count($tokens);
        $scientific = false;

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            if ($scientific === true) {
                if ($token === '-' || $token === '+') {
                    $tokens[$i - 1] .= $tokens[$i];
                    $tokens[$i - 1] .= $tokens[$i + 1];
                    unset($tokens[$i]);
                    unset($tokens[$i + 1]);

                } elseif (is_numeric($token)) {
                    $tokens[$i - 1] .= $tokens[$i];
                    unset($tokens[$i]);
                }
                $scientific = false;
                continue;
            }

            if (strtoupper(substr($token, -1, 1)) === 'E') {
                $scientific = is_numeric(substr($token, 0, -1));
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function concatUserDefinedVariables($tokens) {
        $i = 0;
        $cnt = count($tokens);
        $userdef = false;

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            if ($userdef !== false) {
                $tokens[$userdef] .= $token;
                unset($tokens[$i]);
                if ($token !== "@") {
                    $userdef = false;
                }
            }

            if ($userdef === false && $token === "@") {
                $userdef = $i;
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function concatComments($tokens) {

        $i = 0;
        $cnt = count($tokens);
        $comment = false;
        $backTicks = [];
        $in_string = false;
        $inline = false;

        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            $token = $tokens[$i];

            /*
             * Check to see if we're inside a value (i.e. back ticks).
             * If so inline comments are not valid.
             */
            if ($comment === false && $this->isBacktick($token)) {
                if (!empty($backTicks)) {
                    $lastBacktick = array_pop($backTicks);
                    if ($lastBacktick != $token) {
                        $backTicks[] = $lastBacktick; // Re-add last back tick
                        $backTicks[] = $token;
                    }
                } else {
                    $backTicks[] = $token;
                }
            }

            if($comment === false && ($token == "\"" || $token == "'")) {
                $in_string = !$in_string;
            }
            if(!$in_string) {
                if ($comment !== false) {
                    if ($inline === true && ($token === "\n" || $token === "\r\n")) {
                        $comment = false;
                    } else {
                        unset($tokens[$i]);
                        $tokens[$comment] .= $token;
                    }
                    if ($inline === false && ($token === "*/")) {
                        $comment = false;
                    }
                }

                if (($comment === false) && ($token === "--") && empty($backTicks)) {
                    $comment = $i;
                    $inline = true;
                }

                if (($comment === false) && (substr($token, 0, 1) === "#") && empty($backTicks)) {
                    $comment = $i;
                    $inline = true;
                }

                if (($comment === false) && ($token === "/*")) {
                    $comment = $i;
                    $inline = false;
                }
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function isBacktick($token) {
        return ($token === "'" || $token === "\"" || $token === "`");
    }

    protected function balanceBackticks($tokens) {
        $unsetCount = 0;
        $fullLength = sizeof($tokens);
        foreach ($tokens as $k=> $token){

            if ($unsetCount>0){
                unset($tokens[$k]);
                $unsetCount--;
                continue;
            }
            if ($this->isBacktick($token)) {
                list($token, $unsetCount) = $this->balanceCharacter($tokens, $k, $token, $fullLength);
                $tokens[$k]=$token;
            }
        }

        return array_values($tokens);
    }

    // backticks are not balanced within one token, so we have
    // to re-combine some tokens
    protected function balanceCharacter($tokens, $startPosition, $char, $fullLength) {

        $shift = 0;
        $startPosition ++;

        $between[] = $char;
        for ($i = $startPosition; $i < $fullLength; $i++) {
            $token = $tokens[$i];
            $between[] = $token;
            $shift++;
            if ($token === $char) {
                break;
            }
        }

        $result = implode("", $between);

        return [$result, $shift];
    }

    /**
     * This function concats some tokens to a column reference.
     * There are two different cases:
     *
     * 1. If the current token ends with a dot, we will add the next token
     * 2. If the next token starts with a dot, we will add it to the previous token
     *
     */
    protected function concatColReferences($tokens) {

        $cnt = count($tokens);
        $i = 0;
        while ($i < $cnt) {

            if (!isset($tokens[$i])) {
                $i++;
                continue;
            }

            if ($tokens[$i][0] === ".") {

                // concat the previous tokens, till the token has been changed
                $k = $i - 1;
                $len = strlen($tokens[$i]);
                while (($k >= 0) && ($len == strlen($tokens[$i]))) {
                    if (!isset($tokens[$k])) { // FIXME: this can be wrong if we have schema . table . column
                        $k--;
                        continue;
                    }
                    $tokens[$i] = $tokens[$k] . $tokens[$i];
                    unset($tokens[$k]);
                    $k--;
                }
            }

            if ($this->endsWith($tokens[$i], '.') && !is_numeric($tokens[$i])) {

                // concat the next tokens, till the token has been changed
                $k = $i + 1;
                $len = strlen($tokens[$i]);
                while (($k < $cnt) && ($len == strlen($tokens[$i]))) {
                    if (!isset($tokens[$k])) {
                        $k++;
                        continue;
                    }
                    $tokens[$i] .= $tokens[$k];
                    unset($tokens[$k]);
                    $k++;
                }
            }

            $i++;
        }

        return array_values($tokens);
    }

    protected function concatEscapeSequences($tokens) {
        $tokenCount = count($tokens);
        $i = 0;
        while ($i < $tokenCount) {

            if ($this->endsWith($tokens[$i], "\\")) {
                $i++;
                if (isset($tokens[$i])) {
                    $tokens[$i - 1] .= $tokens[$i];
                    unset($tokens[$i]);
                }
            }
            $i++;
        }
        return array_values($tokens);
    }

    protected function balanceParenthesis($tokens) {
        $token_count = count($tokens);
        $i = 0;
        while ($i < $token_count) {
            if ($tokens[$i] !== '(') {
                $i++;
                continue;
            }
            $count = 1;
            for ($n = $i + 1; $n < $token_count; $n++) {
                $token = $tokens[$n];
                if ($token === '(') {
                    $count++;
                }
                if ($token === ')') {
                    $count--;
                }
                $tokens[$i] .= $token;
                unset($tokens[$n]);
                if ($count === 0) {
                    $n++;
                    break;
                }
            }
            $i = $n;
        }
        return array_values($tokens);
    }
}

?>
joebordes commented 4 months ago

Interesting. @djklim87, Did you run the unit tests after the change?

This is the diff

diff --git a/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php b/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
index 8faf5574c..b27bcd15a 100644
--- a/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
+++ b/vendor/greenlion/php-sql-parser/src/PHPSQLParser/lexer/PHPSQLLexer.php
@@ -283,51 +283,43 @@ class PHPSQLLexer {
     }

     protected function balanceBackticks($tokens) {
-        $i = 0;
-        $cnt = count($tokens);
-        while ($i < $cnt) {
+        $unsetCount = 0;
+        $fullLength = sizeof($tokens);
+        foreach ($tokens as $k=> $token){

-            if (!isset($tokens[$i])) {
-                $i++;
+            if ($unsetCount>0){
+                unset($tokens[$k]);
+                $unsetCount--;
                 continue;
             }
-
-            $token = $tokens[$i];
-
             if ($this->isBacktick($token)) {
-                $tokens = $this->balanceCharacter($tokens, $i, $token);
+                list($token, $unsetCount) = $this->balanceCharacter($tokens, $k, $token, $fullLength);
+                $tokens[$k]=$token;
             }
-
-            $i++;
         }

-        return $tokens;
+        return array_values($tokens);
     }

     // backticks are not balanced within one token, so we have
     // to re-combine some tokens
-    protected function balanceCharacter($tokens, $idx, $char) {
+    protected function balanceCharacter($tokens, $startPosition, $char, $fullLength) {

-        $token_count = count($tokens);
-        $i = $idx + 1;
-        while ($i < $token_count) {
-
-            if (!isset($tokens[$i])) {
-                $i++;
-                continue;
-            }
+        $shift = 0;
+        $startPosition ++;

+        $between[] = $char;
+        for ($i = $startPosition; $i < $fullLength; $i++) {
             $token = $tokens[$i];
-            $tokens[$idx] .= $token;
-            unset($tokens[$i]);
-
+            $between[] = $token;
+            $shift++;
             if ($token === $char) {
                 break;
             }
-
-            $i++;
         }
-        return array_values($tokens);
+        $result = implode("", $between);
+
+        return [$result, $shift];
     }

     /**

would be nice if someone could validate that. I will try to find some time to test it and share.

Thanks HTH

joebordes commented 4 months ago

unit tests are passing

joebordes commented 4 months ago

I created a phpbench script

<?php
include 'vendor/autoload.php';
use PHPSQLParser\PHPSQLParser;

class ParseInsertBench {

    /**
    * @Revs(20)
    * @Iterations(5)
    */
    public function benchparseInsert() {
        $parser = new PHPSQLParser();
        $query = file_get_contents('insert.sql');
        $parser->parse($query);
    }

    /**
    * @Revs(20)
    * @Iterations(5)
    */
    public function benchparseQuery() {
        $parser = new PHPSQLParser();
        $query = file_get_contents('query.sql');
        $parser->parse($query);
    }
}

BEFORE

./phpbench.phar run testspeed.php
PHPBench (1.2.15) running benchmarks... #standwithukraine
with PHP version 8.2.15, xdebug ✔, opcache ❌

\ParseInsertBench

    benchparseInsert........................I4 - Mo2.583m (±0.51%)
    benchparseQuery.........................I4 - Mo9.289ms (±1.21%)

Subjects: 2, Assertions: 0, Failures: 0, Errors: 0

AFTER

./phpbench.phar run testspeed.php
PHPBench (1.2.15) running benchmarks... #standwithukraine
with PHP version 8.2.15, xdebug ✔, opcache ❌

\ParseInsertBench

    benchparseInsert........................I4 - Mo1.966s (±0.71%)
    benchparseQuery.........................I4 - Mo9.634ms (±4.71%)

Subjects: 2, Assertions: 0, Failures: 0, Errors: 0

The difference is significant on the insert SQL but seems worse on the query SQL.

HTH

djklim87 commented 4 months ago

@joebordes can you attach to this issue content of query.sql?

joebordes commented 4 months ago
select DISTINCT vtiger_invoice.subject AS 'Invoice_Subject', (CASE WHEN vtiger_invoice.salesorderid NOT LIKE '' THEN (CASE WHEN vtiger_salesorderInvoice.subject NOT LIKE '' THEN vtiger_salesorderInvoice.subject ELSE '' END) ELSE '' END) AS Invoice_Sales_Order, vtiger_invoice.customerno AS 'Invoice_Customer_No', vtiger_invoice.exciseduty AS 'Invoice_Excise_Duty', vtiger_invoice.salescommission AS 'Invoice_Sales_Commission', (CASE WHEN vtiger_accountInvoice.accountname NOT LIKE '' THEN (CASE WHEN vtiger_accountInvoice.accountname NOT LIKE '' THEN vtiger_accountInvoice.accountname ELSE '' END) ELSE '' END) AS Invoice_Account_Name, vtiger_invoicebillads.bill_street AS 'Invoice_Billing_Address', vtiger_invoiceshipads.ship_street AS 'Invoice_Shipping_Address', (CASE WHEN vtiger_assets.product NOT LIKE '' THEN (CASE WHEN vtiger_productsRelAssets603.productname NOT LIKE '' THEN vtiger_productsRelAssets603.productname ELSE '' END) ELSE '' END) AS Assets_Product_Name, vtiger_assets.serialnumber AS 'Assets_Serial_Number', vtiger_contactsubdetails.assistant AS 'Contacts_Assistant', (CASE WHEN vtiger_contactdetails.reportsto NOT LIKE '' THEN (CASE WHEN CONCAT(vtiger_contactdetailsContacts.firstname,' ',vtiger_contactdetailsContacts.lastname) NOT LIKE '' THEN CONCAT(vtiger_contactdetailsContacts.firstname,' ',vtiger_contactdetailsContacts.lastname) ELSE '' END) ELSE '' END) AS Contacts_Reports_To, vtiger_contactsubdetails.assistantphone AS 'Contacts_Assistant_Phone', vtiger_crmentity.crmid AS "LBL_ACTION" from vtiger_invoice inner join vtiger_crmentity on vtiger_crmentity.crmid=vtiger_invoice.invoiceid left join vtiger_groups on vtiger_groups.groupid = vtiger_crmentity.smownerid left join vtiger_users on vtiger_users.id = vtiger_crmentity.smownerid inner join vtiger_invoicebillads on vtiger_invoice.invoiceid=vtiger_invoicebillads.invoicebilladdressid inner join vtiger_invoiceshipads on vtiger_invoice.invoiceid=vtiger_invoiceshipads.invoiceshipaddressid left join vtiger_salesorder as vtiger_salesorderInvoice on vtiger_salesorderInvoice.salesorderid=vtiger_invoice.salesorderid left join vtiger_account as vtiger_accountInvoice on vtiger_accountInvoice.accountid = vtiger_invoice.accountid left join vtiger_reptmptbl_16632c2b68258b2627337190 as vtiger_assets on vtiger_assets.invoiceid=vtiger_invoice.invoiceid left join vtiger_crmentity as vtiger_crmentityAssets on vtiger_crmentityAssets.crmid = vtiger_assets.assetsid AND vtiger_crmentityAssets.deleted=0 left join vtiger_crmentity as vtiger_crmentityRelAssets0 on vtiger_crmentityRelAssets0.crmid = vtiger_assets.product and vtiger_crmentityRelAssets0.deleted=0 left join vtiger_products as vtiger_productsRelAssets603 on vtiger_productsRelAssets603.productid = vtiger_crmentityRelAssets0.crmid left join vtiger_reptmptbl_16632c2b6839a95870061631 as vtiger_contactdetails on vtiger_invoice.contactid=vtiger_contactdetails.contactid left join vtiger_crmentity as vtiger_crmentityContacts on vtiger_crmentityContacts.crmid = vtiger_contactdetails.contactid AND vtiger_crmentityContacts.deleted=0 left join vtiger_contactdetails as vtiger_contactdetailsContacts on vtiger_contactdetailsContacts.contactid = vtiger_contactdetails.reportsto left join vtiger_contactsubdetails on vtiger_contactdetails.contactid = vtiger_contactsubdetails.contactsubscriptionid WHERE vtiger_invoice.invoiceid>0 AND vtiger_crmentity.deleted=0  limit 0, 40