reorg / pg_repack

Reorganize tables in PostgreSQL databases with minimal locks
BSD 3-Clause "New" or "Revised" License
1.8k stars 166 forks source link

Segmentation fault in Postgres 16 #407

Open nkPit opened 2 days ago

nkPit commented 2 days ago

Hello! I get a Segmentation fault when trying to execute pg_repack --dbname=contrib_regression The backtrace is very large:

Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Core was generated by `pg_repack --dbname=contrib_regression'.
Program terminated with signal SIGSEGV, Segmentation fault.

warning: Section `.reg-xstate/173312' in core file too small.
#0  0x00007fb5adc91224 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
(gdb) bt
#0  0x00007fb5adc91224 in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007fb5adc8f78c in __nss_passwd_lookup2 () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007fb5adc24cf5 in getpwuid_r () from /lib/x86_64-linux-gnu/libc.so.6
#3  0x00007fb5add97148 in pg_get_user_name (user_id=1000, buffer=0x7ffc4eb89210 "", buflen=8192) at thread.c:42
#4  0x00007fb5add8d560 in pg_fe_getusername (user_id=1000, errorMessage=errorMessage@entry=0x0) at fe-auth.c:1200
#5  0x00007fb5add8d5f8 in pg_fe_getauthname (errorMessage=errorMessage@entry=0x0) at fe-auth.c:1231
#6  0x00007fb5add7b79a in conninfo_add_defaults (options=options@entry=0x55a649b2fe50, errorMessage=errorMessage@entry=0x55a649b2e978)
    at fe-connect.c:6375
#7  0x00007fb5add7dcfe in conninfo_array_parse (use_defaults=true, expand_dbname=1, errorMessage=0x55a649b2e978,
    values=0x7ffc4eb8b380, keywords=0x7ffc4eb8b350) at fe-connect.c:6253
#8  PQconnectStartParams (keywords=keywords@entry=0x7ffc4eb8b350, values=values@entry=0x7ffc4eb8b380,
    expand_dbname=expand_dbname@entry=1) at fe-connect.c:799
#9  0x00007fb5add7de3b in PQconnectdbParams (keywords=keywords@entry=0x7ffc4eb8b350, values=values@entry=0x7ffc4eb8b380,
    expand_dbname=expand_dbname@entry=1) at fe-connect.c:682
#10 0x000055a649af12e0 in pgut_connect (dbname=0x55a649b24930 "contrib_regression", host=0x0, port=0x0, username=0x0, password=0x0,
    prompt=DEFAULT, elevel=21) at pgut/pgut.c:532
#11 0x000055a649af214b in reconnect (elevel=elevel@entry=21) at pgut/pgut-fe.c:164
#12 0x000055a649aec8b8 in repack_cleanup_callback (fatal=<optimized out>, userdata=0x7ffc4f385950) at pg_repack.c:2021
#13 0x000055a649aef578 in call_atexit_callbacks (fatal=true) at pgut/pgut.c:1240
#14 0x000055a649af0091 in exit_or_abort (elevel=<optimized out>, exitcode=-2) at pgut/pgut.c:1259
#15 pgut_errfinish (dummy=dummy@entry=0) at pgut/pgut.c:930
#16 0x000055a649af0a2d in pgut_execute_elevel (conn=conn@entry=0x55a649b2e590, query=query@entry=0x55a649afbc1f "RESET lock_timeout",
    nParams=nParams@entry=0, params=params@entry=0x0, elevel=elevel@entry=21) at pgut/pgut.c:1042
#17 0x000055a649af0a96 in pgut_execute (params=0x0, nParams=0, query=0x55a649afbc1f "RESET lock_timeout", conn=0x55a649b2e590)
    at pgut/pgut.c:626
#18 pgut_command (conn=conn@entry=0x55a649b2e590, query=query@entry=0x55a649afbc1f "RESET lock_timeout", nParams=nParams@entry=0,
    params=params@entry=0x0) at pgut/pgut.c:688
#19 0x000055a649aec721 in lock_exclusive (conn=0x55a649b2e590, relid=<optimized out>,
    lock_query=0x55a649b39c87 "LOCK TABLE 16460 IN ACCESS EXCLUSIVE MODE", start_xact=start_xact@entry=false) at pg_repack.c:1996
#20 0x000055a649aec8de in repack_cleanup_callback (fatal=<optimized out>, userdata=0x7ffc4f385950) at pg_repack.c:2024

...

#49793 0x000055a649af0a96 in pgut_execute (params=0x0, nParams=0, query=0x55a649afbc1f "RESET lock_timeout", conn=0x55a649b2e590) at pgut/pgut.c:626
#49794 pgut_command (conn=conn@entry=0x55a649b2e590, query=query@entry=0x55a649afbc1f "RESET lock_timeout", nParams=nParams@entry=0, params=params@entry=0x0) at pgut/pgut.c:688
#49795 0x000055a649aec721 in lock_exclusive (conn=0x55a649b2e590, relid=<optimized out>, lock_query=0x55a649b39c87 "LOCK TABLE 16460 IN ACCESS EXCLUSIVE MODE", start_xact=start_xact@entry=false) at pg_repack.c:1996
#49796 0x000055a649aec8de in repack_cleanup_callback (fatal=<optimized out>, userdata=0x7ffc4f385950) at pg_repack.c:2024
#49797 0x000055a649aef578 in call_atexit_callbacks (fatal=true) at pgut/pgut.c:1240
#49798 0x000055a649af0091 in exit_or_abort (elevel=<optimized out>, exitcode=-2) at pgut/pgut.c:1259
#49799 pgut_errfinish (dummy=dummy@entry=0) at pgut/pgut.c:930
#49800 0x000055a649af0a2d in pgut_execute_elevel (conn=conn@entry=0x55a649b24a80, query=query@entry=0x55a649afbc1f "RESET lock_timeout", nParams=nParams@entry=0, params=params@entry=0x0, elevel=elevel@entry=21) at pgut/pgut.c:1042
#49801 0x000055a649af0a96 in pgut_execute (params=0x0, nParams=0, query=0x55a649afbc1f "RESET lock_timeout", conn=0x55a649b24a80) at pgut/pgut.c:626
#49802 pgut_command (conn=conn@entry=0x55a649b24a80, query=query@entry=0x55a649afbc1f "RESET lock_timeout", nParams=nParams@entry=0, params=params@entry=0x0) at pgut/pgut.c:688
#49803 0x000055a649aec721 in lock_exclusive (conn=0x55a649b24a80, relid=<optimized out>, relid@entry=0x7ffc4f385a38 "16460", lock_query=0x55a649b39c87 "LOCK TABLE 16460 IN ACCESS EXCLUSIVE MODE", start_xact=start_xact@entry=true) at pg_repack.c:1996
#49804 0x000055a649aede8e in repack_one_table (orderby=0x0, table=0x7ffc4f385950) at pg_repack.c:1295
#49805 repack_one_database (orderby=0x0, errbuf=errbuf@entry=0x7ffc4f385ba0 "", errsize=256) at pg_repack.c:970
#49806 0x000055a649aebbb2 in main (argc=2, argv=<optimized out>) at pg_repack.c:400

To reproduce the issue, four files are needed:

cat ./query-2024-06-19T20-04-16.581242.sql

CREATE TABLE testts1 (id serial primary key, data text);
CREATE INDEX testts1_partial_idx on testts1 (id) where (id > 0);
\! pg_repack --dbname=contrib_regression --no-order --table=testts1

SELECT relname, spcname

SELECT relname, spcname
FROM pg_class JOIN pg_tablespace ts ON ts.oid = reltablespace
cat ./query-2024-06-19T20-04-58.303150.sql

SET client_min_messages = warning;
CREATE EXTENSION pg_repack;
RESET client_min_messages;
cat ./query-2024-06-19T20-04-59.492705.sql

\! pg_repack --dbname=contrib_regression --table=tbl_cluster
\! pg_repack --dbnamecontrib_regression --table=tbl_badindex
\! pg_repack --dbname=contrib_regression
cat ./query-2024-06-19T20-05-15.647083.sql

CREATE TABLE tbl_nn    (col1 int NOT NULL, col2 int NOT NULL);
CREATE TABLE tbl_uk    (col1 int NOT NULL, col2 int         , UNIQUE(col1, col2));
CREATE TABLE tbl_nn_uk (col1 int NOT NULL, col2 int NOT NULL, UNIQUE(col1, col2));
CREATE TABLE tbl_pk_uk (col1 int NOT NULL, col2 int NOT NULL, PRIMARY KEY(col1, col2), UNIQUE(col2, col1));
CREATE TABLE tbl_nn_puk (col1 int NOT NULL, col2 int NOT NULL);

The reproduction script itself:

#!/bin/bash
killall postgres || true
sleep 1
rm -rf ./data || true
rm logfile || true
initdb -D data
pg_ctl -D data -l logfile start
dropdb contrib_regression &> /dev/null || true
createdb contrib_regression
psql -c "CREATE EXTENSION pg_repack" -d contrib_regression

while read -r line;
do
    echo " $i processed $line"
    timeout 300 psql -f $line contrib_regression &> /dev/null &
done < files
wait
cat files
./query-2024-06-19T20-04-16.581242.sql
./query-2024-06-19T20-04-58.303150.sql
./query-2024-06-19T20-04-59.492705.sql
./query-2024-06-19T20-05-15.647083.sql

for i in $(seq 20) ; do ~/run_test ; if coredumpctl ; then break; fi ; done

Postgres version:

postgres=# select version();
                                                version
-------------------------------------------------------------------------------------------------------
 PostgreSQL 16.3 on x86_64-pc-linux-gnu, compiled by gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0, 64-bit
(1 row)

pg_repack version: pg_repack | 1.5.1 | public | Reorganize tables in PostgreSQL databases with minimal locks

pg_repack --version
pg_repack 1.5.1

The system on which it was run:

uname -a
Linux ubuntu-2204 5.15.0-107-generic #117-Ubuntu SMP Fri Apr 26 12:26:49 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux

Best regards, Nikita Kalinin Postgres Professional

Melkij commented 2 days ago

Hmm, I want to note that we did not have a 1.5.1 release yet. Are you testing on the current HEAD or on some previous commit?

nkPit commented 2 days ago

Yes. In this case, I'm working with HEAD. But this problem is also present on 1.5.0

za-arthur commented 2 days ago

Unfortunately I couldn't reproduce the issue using your script using pg_repack 1.5.1 and Postgres 16.2 (although I may recheck it with Postgres 16.3).

According to the dump, the segfault happens within PQconnectdbParams() which wasn't used on 1.5.0, it was introduced only by 1.5.1: https://github.com/Melkij/pg_repack/blob/f3d47e5108e5a3dd89e68c48bc659c94713a1056/doc/pg_repack.rst#releases

za-arthur commented 2 days ago

Unfortunately I couldn't reproduce the issue using your script using pg_repack 1.5.1 and Postgres 16.2 (although I may recheck it with Postgres 16.3).

I tried to reproduce the issue with Postgres 16.3 and I cannot reproduce it neither.

@nkPit do you use Postgres instance provided by a Ubuntu package? Or did you build it by yourself?

za-arthur commented 2 days ago

49802 pgut_command (conn=conn@entry=0x55a649b24a80, query=query@entry=0x55a649afbc1f "RESET lock_timeout", nParams=nParams@entry=0, params=params@entry=0x0) at pgut/pgut.c:688

Additionally it seems that pgut_execute() failed during calling RESET lock_timeout within lock_exclusive(). Can you share that error?

nkPit commented 2 days ago

do you use Postgres instance provided by a Ubuntu package? Or did you build it by yourself?

Postgres was built by myself. Config:

./configure --enable-tap-tests --enable-debuge --enable-cassert && make -j8 && make check

test@ubuntu-2204:~/test/postgres$ git branch
* REL_16_STABLE
  master
git log
commit 7857c8dc4481970c27a44a4d1073064ff674b1de (HEAD -> REL_16_STABLE, origin/REL_16_STABLE)
nkPit commented 2 days ago

Can you share that error?

I'm not quite sure what I have to show here. The only thing I have is backtrace.

za-arthur commented 2 days ago

Postgres was built by myself.

Can you try to rebuild your Postgres by running the command make distclean && make && make install to make sure that you don't have any artefacts from previous build?

I'm not quite sure what I have to show here. The only thing I have is backtrace.

Your PostgreSQL logs might contain any errors which was raised during running pg_repack. Could you check it and provide it if there is any relevant error?

nkPit commented 2 days ago

After make install

tail -n 20 logfile
2024-07-01 12:58:53.135 UTC [335388] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-01 12:58:53.135 UTC [335388] STATEMENT:  RESET lock_timeout
2024-07-01 12:58:53.138 UTC [335390] ERROR:  syntax error at or near "16460" at character 12
2024-07-01 12:58:53.138 UTC [335390] STATEMENT:  LOCK TABLE 16460 IN ACCESS EXCLUSIVE MODE
2024-07-01 12:58:53.138 UTC [335390] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-01 12:58:53.138 UTC [335390] STATEMENT:  RESET lock_timeout
2024-07-01 12:58:53.141 UTC [335392] ERROR:  syntax error at or near "16460" at character 12
2024-07-01 12:58:53.141 UTC [335392] STATEMENT:  LOCK TABLE 16460 IN ACCESS EXCLUSIVE MODE
2024-07-01 12:58:53.141 UTC [335392] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-01 12:58:53.141 UTC [335392] STATEMENT:  RESET lock_timeout
2024-07-01 12:58:53.144 UTC [335394] ERROR:  syntax error at or near "16460" at character 12
2024-07-01 12:58:53.144 UTC [335394] STATEMENT:  LOCK TABLE 16460 IN ACCESS EXCLUSIVE MODE
2024-07-01 12:58:53.145 UTC [335394] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-01 12:58:53.145 UTC [335394] STATEMENT:  RESET lock_timeout
2024-07-01 12:58:55.804 UTC [322919] LOG:  received smart shutdown request
2024-07-01 12:58:55.806 UTC [322919] LOG:  background worker "logical replication launcher" (PID 322925) exited with exit code 1
2024-07-01 12:58:55.806 UTC [322920] LOG:  shutting down
2024-07-01 12:58:55.807 UTC [322920] LOG:  checkpoint starting: shutdown immediate
2024-07-01 12:58:55.856 UTC [322920] LOG:  checkpoint complete: wrote 943 buffers (5.8%); 0 WAL file(s) added, 0 removed, 0 recycled; write=0.020 s, sync=0.025 s, total=0.050 s; sync files=326, longest=0.001 s, average=0.001 s; distance=4442 kB, estimate=4442 kB; lsn=0/193DBA0, redo lsn=0/193DBA0
2024-07-01 12:58:55.862 UTC [322919] LOG:  database system is shut down
za-arthur commented 2 days ago

2024-07-01 12:58:53.138 UTC [335390] ERROR: syntax error at or near "16460" at character 12 2024-07-01 12:58:53.138 UTC [335390] STATEMENT: LOCK TABLE 16460 IN ACCESS EXCLUSIVE MODE

Can you show output of the query:

select relname, relid, lock_table from repack.tables;
nkPit commented 1 day ago

Reinstalled postgres and pg_repack. The log has changed a bit.

contrib_regression=# select relname, relid, lock_table from repack.tables;
      relname      | relid |                      lock_table
-------------------+-------+-------------------------------------------------------
 public.tbl_nn     | 16423 | LOCK TABLE public.tbl_nn IN ACCESS EXCLUSIVE MODE
 public.tbl_uk     | 16433 | LOCK TABLE public.tbl_uk IN ACCESS EXCLUSIVE MODE
 public.tbl_nn_puk | 16453 | LOCK TABLE public.tbl_nn_puk IN ACCESS EXCLUSIVE MODE
 public.tbl_nn_uk  | 16441 | LOCK TABLE public.tbl_nn_uk IN ACCESS EXCLUSIVE MODE
 public.tbl_pk_uk  | 16446 | LOCK TABLE public.tbl_pk_uk IN ACCESS EXCLUSIVE MODE
 public.testts1    | 16427 | LOCK TABLE public.testts1 IN ACCESS EXCLUSIVE MODE
(6 rows)
test@ubuntu-2204:~/test$ tail -n 20 logfile
2024-07-02 05:57:17.491 UTC [520854] ERROR:  relation "repack.log_16427" does not exist
2024-07-02 05:57:17.491 UTC [520854] STATEMENT:  LOCK TABLE repack.log_16427 IN ACCESS EXCLUSIVE MODE
2024-07-02 05:57:17.491 UTC [520854] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-02 05:57:17.491 UTC [520854] STATEMENT:  RESET lock_timeout
2024-07-02 05:57:17.495 UTC [520856] ERROR:  relation "repack.log_16427" does not exist
2024-07-02 05:57:17.495 UTC [520856] STATEMENT:  LOCK TABLE repack.log_16427 IN ACCESS EXCLUSIVE MODE
2024-07-02 05:57:17.495 UTC [520856] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-02 05:57:17.495 UTC [520856] STATEMENT:  RESET lock_timeout
2024-07-02 05:57:17.499 UTC [520858] ERROR:  relation "repack.log_16427" does not exist
2024-07-02 05:57:17.499 UTC [520858] STATEMENT:  LOCK TABLE repack.log_16427 IN ACCESS EXCLUSIVE MODE
2024-07-02 05:57:17.499 UTC [520858] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-02 05:57:17.499 UTC [520858] STATEMENT:  RESET lock_timeout
2024-07-02 05:57:17.503 UTC [520860] ERROR:  relation "repack.log_16427" does not exist
2024-07-02 05:57:17.503 UTC [520860] STATEMENT:  LOCK TABLE repack.log_16427 IN ACCESS EXCLUSIVE MODE
2024-07-02 05:57:17.503 UTC [520860] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-02 05:57:17.503 UTC [520860] STATEMENT:  RESET lock_timeout
2024-07-02 05:57:17.507 UTC [520862] ERROR:  relation "repack.log_16427" does not exist
2024-07-02 05:57:17.507 UTC [520862] STATEMENT:  LOCK TABLE repack.log_16427 IN ACCESS EXCLUSIVE MODE
2024-07-02 05:57:17.507 UTC [520862] ERROR:  current transaction is aborted, commands ignored until end of transaction block
2024-07-02 05:57:17.507 UTC [520862] STATEMENT:  RESET lock_timeout