pingcap / tidb-tools

tidb-tools are some useful tool collections for TiDB.
Apache License 2.0
286 stars 190 forks source link

diff: collation is applied incorrectly in ORDER BY when the primary key is composite #759

Open kennytm opened 8 months ago

kennytm commented 8 months ago

Bug Report

Please answer these questions before submitting your issue. Thanks!

  1. What did you do? If possible, provide a recipe for reproducing the error.
-- mysql @ 172.17.0.2:3306
create table t (a varchar(200) collate utf8mb4_general_ci not null, b varchar(200) collate utf8mb4_bin not null, primary key(a, b));
insert into t values ('a', 'aa'), ('B', 'bb'), ('c', 'cc'), ('D', 'dd'), ('e', 'ee'), ('F', 'ff'), ('g', 'gg'), ('H', 'hh'), ('*', 'mysql');

-- tidb @ 127.0.0.1:4000
create table t (a varchar(200) collate utf8mb4_bin not null, b varchar(200) collate utf8mb4_bin not null, primary key(a, b));
insert into t values ('a', 'aa'), ('B', 'bb'), ('c', 'cc'), ('D', 'dd'), ('e', 'ee'), ('F', 'ff'), ('g', 'gg'), ('H', 'hh'), ('*', 'tidb');
[data-sources.src1]
host = '172.17.0.2'
port = 3306
user = 'root'
password = '111'

[data-sources.src2]
host = '127.0.0.1'
port = 4000
user = 'root'
password = ''

[task]
output-dir = './output'
source-instances = ['src1']
target-instance = 'src2'
target-check-tables = ['test.t']
target-configs = ['cfg1']

[table-configs.cfg1]
target-tables = ['test.t']
collation = 'utf8mb4_general_ci'
bin/sync_diff_inspector -C cfg.toml
  1. What did you expect to see?

Only 1 row differs

+------------+---------+--------------------+----------------+---------+-----------+
|   TABLE    | RESULT  | STRUCTURE EQUALITY | DATA DIFF ROWS | UPCOUNT | DOWNCOUNT |
+------------+---------+--------------------+----------------+---------+-----------+
| `test`.`t` | succeed | true               | +1/-1          |       9 |         9 |
+------------+---------+--------------------+----------------+---------+-----------+
  1. What did you see instead?

5 row differs.

+------------+---------+--------------------+----------------+---------+-----------+
|   TABLE    | RESULT  | STRUCTURE EQUALITY | DATA DIFF ROWS | UPCOUNT | DOWNCOUNT |
+------------+---------+--------------------+----------------+---------+-----------+
| `test`.`t` | succeed | true               | +5/-5          |       9 |         9 |
+------------+---------+--------------------+----------------+---------+-----------+
  1. What version of TiDB are you using (tidb-server -V or run select tidb_version(); on TiDB)?

7.3.0

  1. which tool are you using?

sync-diff-inspector 7.1.3 (master version 8811722d71222462578c24da878a59ca46c8baaf)

  1. what versionof tool are you using (pump -V or tidb-lightning -V or syncer -V)?
kennytm commented 8 months ago

The collation setting is only applied to the end of ORDER BY, which only affects the last column with a composite PK (or no PK)

This patch should be applied (this is not the final correct solution because it will amplify #760):

diff --git a/pkg/diff/diff.go b/pkg/diff/diff.go
index 06242f9..b76021b 100644
--- a/pkg/diff/diff.go
+++ b/pkg/diff/diff.go
@@ -986,11 +986,11 @@ func getChunkRows(ctx context.Context, db *sql.DB, schema, table string, tableIn
    }

    for i, key := range orderKeys {
-       orderKeys[i] = dbutil.ColumnName(key)
+       orderKeys[i] = dbutil.ColumnName(key) + collation
    }

-   query := fmt.Sprintf("SELECT /*!40001 SQL_NO_CACHE */ %s FROM %s WHERE %s ORDER BY %s%s",
-       columns, dbutil.TableName(schema, table), where, strings.Join(orderKeys, ","), collation)
+   query := fmt.Sprintf("SELECT /*!40001 SQL_NO_CACHE */ %s FROM %s WHERE %s ORDER BY %s",
+       columns, dbutil.TableName(schema, table), where, strings.Join(orderKeys, ","))

    log.Debug("select data", zap.String("sql", query), zap.Reflect("args", args))
    rows, err := db.QueryContext(ctx, query, args...)
diff --git a/sync_diff_inspector/utils/utils.go b/sync_diff_inspector/utils/utils.go
index b8ca447..58c4748 100644
--- a/sync_diff_inspector/utils/utils.go
+++ b/sync_diff_inspector/utils/utils.go
@@ -163,11 +163,11 @@ func GetTableRowsQueryFormat(schema, table string, tableInfo *model.TableInfo, c
    }

    for i, key := range orderKeys {
-       orderKeys[i] = dbutil.ColumnName(key)
+       orderKeys[i] = dbutil.ColumnName(key) + collation
    }

-   query := fmt.Sprintf("SELECT /*!40001 SQL_NO_CACHE */ %s FROM %s WHERE %%s ORDER BY %s%s",
-       columns, dbutil.TableName(schema, table), strings.Join(orderKeys, ","), collation)
+   query := fmt.Sprintf("SELECT /*!40001 SQL_NO_CACHE */ %s FROM %s WHERE %%s ORDER BY %s",
+       columns, dbutil.TableName(schema, table), strings.Join(orderKeys, ","))

    return query, orderKeyCols
 }