silverstripe / silverstripe-framework

Silverstripe Framework, the MVC framework that powers Silverstripe CMS
https://www.silverstripe.org
BSD 3-Clause "New" or "Revised" License
722 stars 822 forks source link

Alternate implementation for ClassName column #11358

Closed emteknetnz closed 3 days ago

emteknetnz commented 1 week ago

Very large sites with huge databases tables can take a very long time to deploy due to a slow dev/build

A known performance issue on these sorts of sites is the use of an Enum field to represent the polymorhpic ClassName on DataObject tables, which often needs to be updated via an ALTER TABLE query that performs very poorly at scale

Enums do have some advantages though:

A couple of possible alternative implementations:

It might be worth putting some abstraction around how the ClassName column is implemented, defaulting to enum and letting projects switch to a different implementation as required

Notes

Acceptance criteria (added after implementation)

New issues created

Kitchen sink CI - using DBClassNameVarchar instead of DBClassName

PRs

emteknetnz commented 1 week ago

Some basic test classes to look at enum performance on large databases

<?php

use SilverStripe\ORM\DataObject;

class MyBaseDataObject extends DataObject
{
    private static $table_name = 'MyBaseDataObject';

    private static $db = [
        'Title' => 'Varchar'
    ];
}
<?php

use SilverStripe\ORM\DataObject;
use SilverStripe\ORM\DB;

class MyDataObjectA extends MyBaseDataObject
{
    private static $table_name = 'MyDataObjectA';

    private static $db = [
        'SomeField' => 'Varchar'
    ];

    public function requireDefaultRecords()
    {
        $num = 2000000; // total number of records that should exist (note cannot decrease)
        $insert = 100000; // max inserts per query
        $count = self::get()->count();
        $diff = $num - $count;
        $t = '2024-08-29 17:47:10';
        parent::requireDefaultRecords();
        if ($diff > 0) {
            # use raw SQL to make insertions much faster
            $loops = ceil($diff / $insert);
            for ($loop = 0; $loop < $loops; $loop++) {
                $sqlBase = [];
                $sqlA = [];
                for ($i = 1; $i <= $insert && $i <= $diff; $i++) {
                    $id = $count + ($loop * $insert) + $i;
                    $sqlBase[] = "($id, 'MyDataObjectA', 'My Data Object $id', '$t', '$t')";
                    $sqlA[] = "($id, 'Some value $id')";
                }
                if (!empty($sqlBase)) {
                    DB::query('INSERT INTO "MyBaseDataObject" ("ID", "ClassName", "Title", "LastEdited", "Created") VALUES ' . implode(',', $sqlBase) . ';');
                    DB::query('INSERT INTO "MyDataObjectA" ("ID", "SomeField") VALUES ' . implode(',', $sqlA) . ';
                    ');
                }
                $diff -= $insert;
            }
        }
    }
}
<?php

use SilverStripe\CMS\Controllers\ContentController;
use SilverStripe\ORM\DB;

class PageController extends ContentController
{
    protected function init()
    {
        parent::init();
        // 1 million records 2.810 seconds though also 0.009 seconds after the first time
        // 2 million records 5.444 seconds
        // doesn't matter the number of enums, what matters is the number of records
        if (isset($_GET['timer'])) {
            $start = microtime(true);
            $sql = <<<EOT
            ALTER TABLE
                `MyBaseDataObject`
            MODIFY COLUMN
                `ClassName` enum(
                    'MyDataObjectA',
                    'SomethingElseA',
                    'SomethingElseB',
                    'SomethingElseC',
                    'SomethingElseD',
                    'SomethingElseE',
                    'SomethingElseF',
                    'SomethingElseG',
                    'SomethingElseH'
                )
            NOT NULL AFTER `ID`;
            EOT;
            DB::query($sql);
            $end = microtime(true);
            $time = $end - $start;
            // format to 3 decimal places
            $time = number_format($time, 3);
            echo $time;
            die;
        }
    }
}
emteknetnz commented 1 week ago

https://github.com/emteknetnz/silverstripe-dev-build-benchmark can be used to check how long different queries take to run when doing a dev/build, including during deployments. Works for both CMS 4 and 5

Tested on a deployment to a test environment where there was some though not much existing data on the server to Running the following script to show percentages of time spent on each query

<?php

$res = ['other' => 0];
$isHeader = true;
if (($handle = fopen("data.csv", "r")) !== FALSE) {
  while (($data = fgetcsv($handle, 1000, ",")) !== FALSE) {
    if ($isHeader) {
        $isHeader = false;
        continue;
    }
    [$id, $when, $time, $sql] = $data;
    $sql = strtolower($sql);
    $startsWiths = [
        'ALTER TABLE',
        'CHECK TABLE',
        'SELECT COUNT(*)',
        'SELECT DISTINCT',
        'SHOW FULL FIELDS IN',
        'SHOW FULL TABLES WHERE Table_Type',
        'SHOW INDEXES IN',
        'SHOW TABLE STATUS LIKE',
        'SHOW TABLES LIKE',
        'UPDATE',
    ];
    $matched = false;
    foreach ($startsWiths as $startsWith) {
        if (str_starts_with(strtolower($sql), strtolower($startsWith))) {
            $res[$startsWith] ??= 0;
            $res[$startsWith] += $time;
            $matched = true;
            continue 2;
        }
    }
    if (!$matched) {
        $res['other'] += $time;
    }
  }
  fclose($handle);
}
uksort($res, function ($a, $b) use ($res) {
    return $res[$b] <=> $res[$a];
});
$percs = [];
echo "\nTime in seconds:\n";
foreach ($res as $key => $val) {
    printf('%7.2f %s%s', $val, $key, PHP_EOL);
    $percs[$key] = $val / array_sum($res) * 100;
}
echo "\nPercentages:\n";
foreach ($percs as $key => $val) {
    printf('%6.1f%% %s%s', $val, $key, PHP_EOL);
}
Time in seconds:
  12.33 SHOW FULL TABLES WHERE Table_Type
   8.33 ALTER TABLE
   4.61 SHOW FULL FIELDS IN
   4.53 SHOW INDEXES IN
   3.47 SHOW TABLE STATUS LIKE
   3.18 other
   3.15 SHOW TABLES LIKE
   1.96 CHECK TABLE
   0.91 SELECT COUNT(*)
   0.66 SELECT DISTINCT
   0.29 UPDATE

Percentages:
  28.4% SHOW FULL TABLES WHERE Table_Type
  19.2% ALTER TABLE
  10.6% SHOW FULL FIELDS IN
  10.4% SHOW INDEXES IN
   8.0% SHOW TABLE STATUS LIKE
   7.3% other
   7.3% SHOW TABLES LIKE
   4.5% CHECK TABLE
   2.1% SELECT COUNT(*)
   1.5% SELECT DISTINCT
   0.7% UPDATE
GuySartorelli commented 4 days ago

PRs merged