duckdb / duckdb_mysql

https://duckdb.org/docs/extensions/mysql
MIT License
47 stars 11 forks source link

[Feature] Optimize performance for select count() for tables with primary key #13

Open thbley opened 10 months ago

thbley commented 10 months ago

What happens?

I tested duckdb v0.9.2 3c695d7ba9 with latest linux_amd64_gcc4-extensions (https://github.com/duckdb/duckdb_mysql/actions/runs/6863109179 linux_amd64_gcc4-extensions) and mysql 8.0.35.

Currently performance in duckdb is lower compared to mysql client. Query results are correct, utf8mb4 works!

I tested:

select * from mysql.t2 where c1 = 1000000;
select count(*) from mysql.t2;
select count(c1) from mysql.t2;
LOAD './mysql_scanner.duckdb_extension';
ATTACH 'host=127.0.0.1 user=root password=root port=3306 database=test' AS mysql (TYPE mysql_scanner);

D describe mysql.t2;
┌─────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│ column_name │ column_type │  null   │   key   │ default │ extra │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ int32 │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ c1          │ INTEGER     │ NO      │         │         │       │
│ c2          │ VARCHAR     │ YES     │         │         │       │
│ c3          │ TIMESTAMP   │ YES     │         │         │       │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴───────┘
Run Time (s): real 0.111 user 0.077912 sys 0.003893

D select * from mysql.t2 where c1 = 1000000;
100% ▕████████████████████████████████████████████████████████████▏ 
┌─────────┬──────────┬─────────────────────┐
│   c1    │    c2    │         c3          │
│  int32  │ varchar  │      timestamp      │
├─────────┼──────────┼─────────────────────┤
│ 1000000 │ öäüßвгдж │ 2023-11-15 02:47:57 │
└─────────┴──────────┴─────────────────────┘
Run Time (s): real 3.149 user 0.901230 sys 0.061170

D select * from mysql.t2 where c1 = 0;
100% ▕████████████████████████████████████████████████████████████▏ 
┌───────┬──────────────────────────────┬─────────────────────┐
│  c1   │              c2              │         c3          │
│ int32 │           varchar            │      timestamp      │
├───────┼──────────────────────────────┼─────────────────────┤
│     0 │ 😃🌀🌁🌂🌃🌄🌅🌆🌇🌈🌉🌊🌋🌌 │ 2023-11-15 03:12:51 │
└───────┴──────────────────────────────┴─────────────────────┘
Run Time (s): real 3.136 user 0.858150 sys 0.109318

select count(*) from mysql.t2;
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      1048569 │
└──────────────┘
Run Time (s): real 0.431 user 0.094944 sys 0.011416

D select count(c1) from mysql.t2;
┌───────────┐
│ count(c1) │
│   int64   │
├───────────┤
│   1048569 │
└───────────┘
Run Time (s): real 1.982 user 0.482755 sys 0.002354

D explain select * from mysql.t2 where c1 = 1000000;
┌─────────────────────────────┐
│┌───────────────────────────┐│
││       Physical Plan       ││
│└───────────────────────────┘│
└─────────────────────────────┘
┌───────────────────────────┐
│           FILTER          │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│       (c1 = 1000000)      │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│           EC: 1           │
└─────────────┬─────────────┘                             
┌─────────────┴─────────────┐
│        MYSQL_SCAN         │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             t2            │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│             c1            │
│             c2            │
│             c3            │
│   ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─   │
│           EC: 1           │
└───────────────────────────┘                             
Run Time (s): real 0.007 user 0.003512 sys 0.000372

with mysql client:

mysql> show create table t2\G
*************************** 1. row ***************************
       Table: t2
Create Table: CREATE TABLE `t2` (
  `c1` int NOT NULL,
  `c2` varchar(255) DEFAULT NULL,
  `c3` datetime DEFAULT NULL,
  PRIMARY KEY (`c1`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
1 row in set (0.00 sec)

mysql> select * from t2 where c1=1000000;
+---------+------------------+---------------------+
| c1      | c2               | c3                  |
+---------+------------------+---------------------+
| 1000000 | öäüßвгдж         | 2023-11-15 02:47:57 |
+---------+------------------+---------------------+
1 row in set (0.00 sec)

mysql> select count(*) from t2;
+----------+
| count(*) |
+----------+
|  1048569 |
+----------+
1 row in set (0.07 sec)

mysql> select count(c1) from t2;
+-----------+
| count(c1) |
+-----------+
|   1048569 |
+-----------+
1 row in set (0.07 sec)

select c1 from t2 USE INDEX () where c1 = 1000000;
+---------+
| c1      |
+---------+
| 1000000 |
+---------+
1 row in set (0.25 sec)

mysql> select * from t2 where c1 = 0;
+----+----------------------------------------------------------+---------------------+
| c1 | c2                                                       | c3                  |
+----+----------------------------------------------------------+---------------------+
|  0 | 😃🌀🌁🌂🌃🌄🌅🌆🌇🌈🌉🌊🌋🌌                                                         | 2023-11-15 03:12:51 |
+----+----------------------------------------------------------+---------------------+
1 row in set (0.00 sec)

mysql> explain select * from t2 where c1=1000000;
+----+-------------+-------+------------+-------+---------------+---------+---------+-------+------+----------+-------+
| id | select_type | table | partitions | type  | possible_keys | key     | key_len | ref   | rows | filtered | Extra |
+----+-------------+-------+------------+-------+---------------+---------+---------+-------+------+----------+-------+
|  1 | SIMPLE      | t2    | NULL       | const | PRIMARY       | PRIMARY | 4       | const |    1 |   100.00 | NULL  |
+----+-------------+-------+------------+-------+---------------+---------+---------+-------+------+----------+-------+
1 row in set, 1 warning (0.00 sec)

It would be great if performance could be optimized a bit for primary key selects!

Copying data from mysql to duckdb was very fast, much faster than copying inside of mysql!

D create table t4 (c1 int, c2 varchar(255), c3 datetime);
Run Time (s): real 0.009 user 0.000372 sys 0.000038
D insert into t4 select * from mysql.t2;
Run Time (s): real 3.933 user 2.549588 sys 0.162101
D select count(*) from t4;
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      1048570 │
└──────────────┘
Run Time (s): real 0.001 user 0.001245 sys 0.000123

mysql> create table t3 like t2;
Query OK, 0 rows affected (0.07 sec)
mysql> insert into t3 select * from t2;
Query OK, 1048570 rows affected (11.52 sec)
Records: 1048570  Duplicates: 0  Warnings: 0

To Reproduce

Create a table with 3 columns (int, varchar, datetime) and fill it with 1m rows. Execute queries to select from primary key and run a count(*).

OS:

Ubuntu 22.04.3

MySQL Version:

8.0.35

DuckDB Version:

0.9.2

DuckDB Client:

cli

Full Name:

Thomas Bley

Affiliation:

myself

Have you tried this on the latest main branch?

Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?

thbley commented 10 months ago

I found the problem for c1 = 1000000. Based on (https://github.com/duckdb/duckdb_mysql/commit/d391e5b3cd2cae6c084a31b26bc6a702c82ec7a0) I was missing:

SET GLOBAL mysql_experimental_filter_pushdown=true;

D select * from t1 where c1 = 1000000;
100% ┌─────────┬──────────────────────────────────────┬─────────────────────┐
│   c1    │                  c2                  │         c3          │
│  int32  │               varchar                │      timestamp      │
├─────────┼──────────────────────────────────────┼─────────────────────┤
│ 1000000 │ c37de652-83ad-11ee-b826-0242ac110002 │ 2023-11-15 11:54:41 │
└─────────┴──────────────────────────────────────┴─────────────────────┘
Run Time (s): real 30.895 user 11.832409 sys 1.670956

D SET GLOBAL mysql_experimental_filter_pushdown=true;

Run Time (s): real 0.000 user 0.000000 sys 0.000171
D select * from t1 where c1 = 1000000;
┌─────────┬──────────────────────────────────────┬─────────────────────┐
│   c1    │                  c2                  │         c3          │
│  int32  │               varchar                │      timestamp      │
├─────────┼──────────────────────────────────────┼─────────────────────┤
│ 1000000 │ c37de652-83ad-11ee-b826-0242ac110002 │ 2023-11-15 11:54:41 │
└─────────┴──────────────────────────────────────┴─────────────────────┘
Run Time (s): real 0.013 user 0.005777 sys 0.001366
thbley commented 10 months ago

It would be great if select count() could be optimized in the mysql extension if the table has a primary key.

e.g.

D select count() from t1;
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      8388608 │
└──────────────┘
Run Time (s): real 4.287 user 1.170847 sys 0.196719
D select count(c1) from t1;
┌───────────┐
│ count(c1) │
│   int64   │
├───────────┤
│   8388608 │
└───────────┘
Run Time (s): real 17.080 user 2.306154 sys 0.252885
D select count(*) from t1;
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      8388608 │
└──────────────┘
Run Time (s): real 4.438 user 0.941101 sys 0.221295
D select count(*) from t1 where c1 > 0;
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      8388608 │
└──────────────┘
Run Time (s): real 18.549 user 2.303993 sys 0.310117
D select count(*) from t1 where c1 != 0;
┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│      8388608 │
└──────────────┘
Run Time (s): real 16.512 user 2.107903 sys 0.258574

mysql> select count(*) from t1;
+----------+
| count(*) |
+----------+
|  8388608 |
+----------+
1 row in set (0.76 sec)

mysql> select count(c1) from t1;
+-----------+
| count(c1) |
+-----------+
|   8388608 |
+-----------+
1 row in set (0.77 sec)

mysql> select count(*) from t1 where c1 != 0;
+----------+
| count(*) |
+----------+
|  8388608 |
+----------+
1 row in set (4.45 sec)

mysql> explain select count(*) from t1;
+----+-------------+-------+------------+-------+---------------+---------+---------+------+---------+----------+-------------+
| id | select_type | table | partitions | type  | possible_keys | key     | key_len | ref  | rows    | filtered | Extra       |
+----+-------------+-------+------------+-------+---------------+---------+---------+------+---------+----------+-------------+
|  1 | SIMPLE      | t1    | NULL       | index | NULL          | PRIMARY | 4       | NULL | 8161825 |   100.00 | Using index |
+----+-------------+-------+------------+-------+---------------+---------+---------+------+---------+----------+-------------+
1 row in set, 1 warning (0.00 sec)