Open glk1001 opened 3 years ago
Please provide a test case.
=====test_pglogical_forever.sh
#!/bin/bash -x
set -u
# This test fails after a few retries on a Centos 7 Vagrant box.
#
# Vagrant: 2.2.7
# VirtualBox: 6.1
# Postgres: 13.4
# Pglogical: postgresql13-pglogical.x86_64 2.4.0-1.el7
# Centos: 7.7.1908 (Core)
declare -r THIS_TEST_START_TIME=$(date +"%Y-%m-%d_%H-%M-%S")
for retry_num in $(seq 100); do
bash test_pglogical_setup_slave.sh "${THIS_TEST_START_TIME}" "${retry_num}"
if [[ $? != 0 ]]; then
echo "ERROR: Master/Slave setup failure."
exit 1
fi
bash test_pglogical.sh
if [[ $? != 0 ]]; then
echo "ERROR: Test failed after ${retry_num} tries."
exit 1
fi
done
=====
=====test_pglogical.sh
#!/bin/bash -x
set -u
set -e
declare -r POSTGRES_VERSION=13
declare -r SRCE_PORT=5340
declare -r DEST_PORT=5341
declare -r DEST_DATA_DIR="/var/lib/pgsql/${POSTGRES_VERSION}/slave"
declare -r DBNAME="test_db"
declare -r TABLE="test_table"
declare -r TOTAL_TIME_SECS=30
declare -r TRIGGER_FILE="/tmp/stop_inserting"
function create_test_database()
{
rm -f "${TRIGGER_FILE}"
psql -p ${SRCE_PORT} -c "DROP DATABASE IF EXISTS ${DBNAME}"
psql -p ${SRCE_PORT} -c "CREATE DATABASE ${DBNAME}"
psql -p ${SRCE_PORT} -d "${DBNAME}" -c "CREATE TABLE ${TABLE} (f1 int primary key, insert_time timestamp without time zone)"
psql -p ${SRCE_PORT} -d "${DBNAME}" -c "CREATE TABLE table1 (f1 int primary key, insert_time timestamp without time zone)"
}
function add_records_to_test_table()
{
local -r sleep_secs=0
rm -f "${TRIGGER_FILE}"
psql -p ${SRCE_PORT} -d "${DBNAME}" -c "DELETE FROM ${TABLE}"
echo "***START LOOP TO INSERT RECORDS INTO \"${DBNAME}\"..."
echo
local -r temp_sql_file="/tmp/test_pglogical.sql"
local -r start_time=$(date +%s)
local f1_val_done=0
local f1_val=1
while true; do
echo "" > "${temp_sql_file}"
for i in $(seq 2); do
f1_val_done=${f1_val}
echo "INSERT INTO ${TABLE} VALUES($f1_val, CURRENT_TIMESTAMP);" >> "${temp_sql_file}"
f1_val=$((f1_val + 1))
done
psql -p ${SRCE_PORT} -d "${DBNAME}" -f "${temp_sql_file}" &> /dev/null
if [[ -f "${TRIGGER_FILE}" ]]; then
echo
echo "*** FOUND TRIGGER FILE \"${TRIGGER_FILE}\". STOPPING NOW..."
break
fi
sleep ${sleep_secs}
time_now=$(date +%s)
if (( $((time_now - start_time)) > TOTAL_TIME_SECS )); then
break
fi
done
echo "*** STOPPED LOOP (PID ${BASHPID}) INSERTING RECORDS INTO \"${DBNAME}\". LAST RECORD WAS \"${f1_val_done}\"."
echo
return 0
}
function setup_master_node()
{
psql -p ${SRCE_PORT} -d ${DBNAME} -c "CREATE EXTENSION IF NOT EXISTS pglogical"
psql -p ${SRCE_PORT} -d ${DBNAME} -c "select pglogical.create_node(node_name := 'provider', dsn := 'host=localhost port=${SRCE_PORT} dbname=${DBNAME}')"
psql -p ${SRCE_PORT} -d ${DBNAME} -c "select pglogical.create_replication_set('replication_set')"
psql -p ${SRCE_PORT} -d ${DBNAME} -c "select pglogical.replication_set_add_table(set_name := 'replication_set', relation := '${TABLE}')"
echo
echo "*** Successfully setup pglogical master on port ${SRCE_PORT}."
echo
}
function setup_subscriber_node()
{
echo
echo "*** Stopping slave, then running pglogical_create_subscriber..."
echo
/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl stop -m fast -D ${DEST_DATA_DIR}
/usr/pgsql-${POSTGRES_VERSION}/bin/pglogical_create_subscriber -D ${DEST_DATA_DIR} \
--subscriber-name="subscription" \
--subscriber-dsn="host=localhost port=${DEST_PORT} dbname=${DBNAME}" \
--provider-dsn="host=localhost port=${SRCE_PORT} dbname=${DBNAME}" \
--replication-sets="ddl_sql,replication_set" \
-v --drop-slot-if-exists
echo
echo "*** Successfully started pglogical slave on port ${DEST_PORT}."
echo
}
create_test_database
add_records_to_test_table &
sleep 2
setup_master_node
setup_subscriber_node
touch "${TRIGGER_FILE}"
sleep 5
NUM_SRCE_RECS=$(psql -qt -p ${SRCE_PORT} -d "${DBNAME}" -c "SELECT COUNT(f1) FROM ${TABLE}")
NUM_DEST_RECS=$(psql -qt -p ${DEST_PORT} -d "${DBNAME}" -c "SELECT COUNT(f1) FROM ${TABLE}")
echo "NUM_SRCE_RECS = ${NUM_SRCE_RECS}"
echo "NUM_DEST_RECS = ${NUM_DEST_RECS}"
if (( NUM_SRCE_RECS != NUM_DEST_RECS )); then
echo "ERROR: srce/dest records mismatch"
exit 1
fi
======
=====test_pglogical_setup_slave.sh
#!/bin/bash -x
set -u
declare -r THIS_TEST_START_TIME=${1}
declare -r TEST_NUM=${2}
declare -r POSTGRES_VERSION=13
declare -r MASTER_PORT=5340
declare -r SLAVE_PORT=5341
declare -r MASTER_DIR="/var/lib/pgsql/${POSTGRES_VERSION}/master"
declare -r SLAVE_DIR="/var/lib/pgsql/${POSTGRES_VERSION}/slave"
/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl stop -m immediate -D ${SLAVE_DIR}
if pgrep -af "postgres .*slave" ; then
echo "ERROR: Could not stop slave."
exit 1
fi
rm -rf ${SLAVE_DIR}
if [[ -d ${SLAVE_DIR} ]]; then
echo "ERROR: Could not delete slave directory \"${SLAVE_DIR}\"."
exit 1
fi
/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl stop -m immediate -D ${MASTER_DIR}
if pgrep -af "postgres .*master" ; then
echo "ERROR: Could not stop master."
exit 1
fi
rm -rf ${MASTER_DIR}
if [[ -d ${MASTER_DIR} ]]; then
echo "ERROR: Could not delete master directory \"${MASTER_DIR}\"."
exit 1
fi
/usr/pgsql-${POSTGRES_VERSION}/bin/initdb -D ${MASTER_DIR} -E UTF8 --lc-collate=C --lc-ctype=C
if [[ $? != 0 ]]; then
echo "ERROR: Could not create master."
exit 1
fi
echo "port = ${MASTER_PORT}" >> ${MASTER_DIR}/postgresql.auto.conf
echo "shared_preload_libraries = 'pglogical'" >> ${MASTER_DIR}/postgresql.auto.conf
echo "wal_level = logical" >> ${MASTER_DIR}/postgresql.auto.conf
echo "max_worker_processes = 10" >> ${MASTER_DIR}/postgresql.auto.conf
echo "max_replication_slots = 10" >> ${MASTER_DIR}/postgresql.auto.conf
echo "max_wal_senders = 10" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_directory = '${PWD}/log'" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_filename = 'postgresql-${THIS_TEST_START_TIME}-test-${TEST_NUM}.log'" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_checkpoints = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_connections = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_disconnections = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_replication_commands = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_duration = off" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_min_duration_statement = 0" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_statement = all" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_replication_commands = on" >> ${MASTER_DIR}/postgresql.auto.conf
echo "log_line_prefix = '%m [%p] port=${MASTER_PORT} %q%u@%d/%a '" >> ${MASTER_DIR}/postgresql.auto.conf
echo "host replication postgres 127.0.0.1/32 trust" >> ${MASTER_DIR}/pg_hba.conf
echo "host test_db postgres 127.0.0.1/32 trust" >> ${MASTER_DIR}/pg_hba.conf
echo "host test_db postgres 127.0.0.1/32 trust" >> ${MASTER_DIR}/pg_hba.conf
/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl start -D ${MASTER_DIR}
if [[ $? != 0 ]]; then
echo "ERROR: Could not start master."
exit 1
fi
pg_basebackup -h localhost -U postgres -p ${MASTER_PORT} -D ${SLAVE_DIR} -P -Xs -R
if [[ $? != 0 ]]; then
echo "ERROR: Could not create slave."
exit 1
fi
echo "port = ${SLAVE_PORT}" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "archive_mode = off" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "hot_standby = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "shared_preload_libraries = 'pglogical'" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_directory = '${PWD}/log'" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_filename = 'postgresql-${THIS_TEST_START_TIME}-test-${TEST_NUM}.log'" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_checkpoints = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_connections = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_disconnections = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_replication_commands = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_duration = off" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_min_duration_statement = 0" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_statement = all" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_replication_commands = on" >> ${SLAVE_DIR}/postgresql.auto.conf
echo "log_line_prefix = '%m [%p] port=${SLAVE_PORT} %q%u@%d/%a '" >> ${SLAVE_DIR}/postgresql.auto.conf
/usr/pgsql-${POSTGRES_VERSION}/bin/pg_ctl start -D ${SLAVE_DIR}
if [[ $? != 0 ]]; then
echo "ERROR: Could not start slave."
exit 1
fi
=====
I did get a failure on a physical machine after about 3000 retries.
Hi, I've looked into this a bit more and it's failing because of this code sequence:
In
void pglogical_apply_main(Datum main_arg)
there is a call to
pglogical_sync_subscription(MySubscription);
Further in this gets called:
PGLogicalSyncStatus * get_subscription_sync_status(Oid subid, bool missing_ok)
...
rv = makeRangeVar(EXTENSION_NAME, CATALOG_LOCAL_SYNC_STATUS, -1);
rel = table_openrv(rv, RowExclusiveLock);
tupDesc = RelationGetDescr(rel);
The problem is here - sometimes the return for sync status is 'i' instead of 'r'. This happens even though the 'pglogical_create_subscriber' program sets sync_status = 'r' just after setting up a subscription. I can make the test script pass by forcing sync status = 'r' to be returned from 'get_subscription_sync_status'. Not sure why 'table_openrv' is not always getting 'r'.
@glk1001 we also met the same issue, could you share how you fix it? Or workaround?
We have a test harness that checks no records are lost when using 'pglogical_create_subscriber' to convert a streaming slave to a logical slave. This was working fine in Postgres 10 but we're just moving to Postgres 13 and this test harness is failing. Basically, the test has a separate process updating records on the master at the same time as 'pglogical_create_subscriber' is converting the streaming, and a comparison of the records at the end. Not sure where to look to solve this but I can see that when the conversion is not successful, the Postgres log on the master instance has entries with 'app' = '..._snap' including a line with 'SELECT pg_drop_replication_slot($1)'. These are not there when the test passes - if that's a clue. I can dig deeper if you can give me some suggestions or I could put more time into a simpler test.