From ea082421db04b34b13786f7a3ea57578a784d6f8 Mon Sep 17 00:00:00 2001 From: sjaakola Date: Thu, 4 Jan 2024 10:10:51 +0200 Subject: [PATCH] MDEV-33175 applier FK check failure retrying Implementation of foreign key constraint check retrying for replication appliers. Foreign key constraint checks may occasionally fail even though the constraints are not violated. There have been cases where e.g. the same transaction inserts in the parent table, and the next insert into child table fails in foreign key checks. The underlying reason for these failures is not known. This change adds retrying of constraint check in InnoDB. The number of retries is by default 1, and the number of retries can be controlled by a new system variable `wsrep-applier-FK-failure-retries`. If the constraint check fails despite retries, the final retry prints out a warning with an error code and InnoDB system monitor output for further troubleshooting. Co-authored-by: Teemu Ollakka --- include/mysql/service_wsrep.h | 4 +- .../galera/r/galera_FK_failure_retry.result | 46 +++++++++++ .../suite/galera/r/galera_defaults.result | 3 +- .../galera/t/galera_FK_failure_retry.test | 77 +++++++++++++++++++ sql/sql_plugin_services.inl | 3 +- sql/sys_vars.cc | 6 ++ sql/wsrep_mysqld.cc | 6 ++ sql/wsrep_mysqld.h | 1 + storage/innobase/handler/ha_innodb.cc | 1 + storage/innobase/include/trx0trx.h | 5 ++ storage/innobase/row/row0ins.cc | 57 ++++++++++++++ 11 files changed, 206 insertions(+), 3 deletions(-) create mode 100644 mysql-test/suite/galera/r/galera_FK_failure_retry.result create mode 100644 mysql-test/suite/galera/t/galera_FK_failure_retry.test diff --git a/include/mysql/service_wsrep.h b/include/mysql/service_wsrep.h index e1f7994f7792f..2f799c8906654 100644 --- a/include/mysql/service_wsrep.h +++ b/include/mysql/service_wsrep.h @@ -89,6 +89,7 @@ extern struct wsrep_service_st { void (*wsrep_thd_kill_LOCK_func)(const MYSQL_THD thd); void (*wsrep_thd_kill_UNLOCK_func)(const MYSQL_THD thd); void
(*wsrep_thd_set_wsrep_PA_unsafe_func)(MYSQL_THD thd); + uint (*wsrep_retry_FK_failure)(); } *wsrep_service; #define MYSQL_SERVICE_WSREP_INCLUDED @@ -133,8 +134,8 @@ extern struct wsrep_service_st { #define wsrep_thd_is_applying(T) wsrep_service->wsrep_thd_is_applying_func(T) #define wsrep_report_bf_lock_wait(T,I) wsrep_service->wsrep_report_bf_lock_wait(T,I) #define wsrep_thd_set_PA_unsafe(T) wsrep_service->wsrep_thd_set_PA_unsafe_func(T) +#define wsrep_retry_FK_failure() wsrep_service->wsrep_retry_FK_failure() #else - #define MYSQL_SERVICE_WSREP_STATIC_INCLUDED extern ulong wsrep_debug; extern my_bool wsrep_log_conflicts; @@ -232,5 +233,6 @@ extern "C" void wsrep_report_bf_lock_wait(const THD *thd, unsigned long long trx_id); /* declare parallel applying unsafety for the THD */ extern "C" void wsrep_thd_set_PA_unsafe(MYSQL_THD thd); +extern "C" uint wsrep_retry_FK_failure(); #endif #endif /* MYSQL_SERVICE_WSREP_INCLUDED */ diff --git a/mysql-test/suite/galera/r/galera_FK_failure_retry.result b/mysql-test/suite/galera/r/galera_FK_failure_retry.result new file mode 100644 index 0000000000000..44bd5a95f3df8 --- /dev/null +++ b/mysql-test/suite/galera/r/galera_FK_failure_retry.result @@ -0,0 +1,46 @@ +connection node_2; +connection node_1; +CREATE TABLE parent ( +id INT PRIMARY KEY, +j INT +) ENGINE=InnoDB; +CREATE TABLE child ( +id INT PRIMARY KEY AUTO_INCREMENT, +parent_id INT, +KEY (parent_id), +CONSTRAINT FOREIGN KEY (parent_id) REFERENCES parent(id) +) ENGINE=InnoDB; +connection node_1; +INSERT INTO parent VALUES (1,1); +connection node_2; +SET GLOBAL debug_dbug ='+d,wsrep_force_FK_check_fail'; +SET GLOBAL wsrep_slave_FK_checks = ON; +SET GLOBAL wsrep_applier_FK_failure_retries = 5; +SET wsrep_sync_wait = 15; +connection node_1; +INSERT INTO child VALUES (1,1); +connection node_2; +SELECT * from child; +id parent_id +1 1 +include/assert_grep.inc [warning for FK constraint failure exists] +SET GLOBAL wsrep_applier_FK_failure_retries = 1; +connection node_1; 
+INSERT INTO child VALUES (2,1); +SELECT * from child; +id parent_id +1 1 +2 1 +connection node_2; +select * from parent; +id j +1 1 +SET GLOBAL debug_dbug = NULL; +connection node_2; +SET GLOBAL wsrep_slave_FK_checks = DEFAULT; +SET GLOBAL wsrep_applier_FK_failure_retries = DEFAULT; +include/assert_grep.inc [warning for FK constraint failure missing] +CALL mtr.add_suppression("final FK constraint check failed for.*"); +connection node_1; +DROP TABLE child; +DROP TABLE parent; diff --git a/mysql-test/suite/galera/r/galera_defaults.result b/mysql-test/suite/galera/r/galera_defaults.result index 9a5c1e54b066c..63ee09361c357 100644 --- a/mysql-test/suite/galera/r/galera_defaults.result +++ b/mysql-test/suite/galera/r/galera_defaults.result @@ -3,7 +3,7 @@ connection node_1; # Correct Galera library found SELECT COUNT(*) `expect 49` FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%'; expect 49 -49 +50 SELECT VARIABLE_NAME, VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES WHERE VARIABLE_NAME LIKE 'wsrep_%' @@ -20,6 +20,7 @@ AND VARIABLE_NAME NOT IN ( ) ORDER BY VARIABLE_NAME; VARIABLE_NAME VARIABLE_VALUE +WSREP_APPLIER_FK_FAILURE_RETRIES 1 WSREP_AUTO_INCREMENT_CONTROL ON WSREP_CAUSAL_READS ON WSREP_CERTIFICATION_RULES strict diff --git a/mysql-test/suite/galera/t/galera_FK_failure_retry.test b/mysql-test/suite/galera/t/galera_FK_failure_retry.test new file mode 100644 index 0000000000000..a9450b7dd8ed5 --- /dev/null +++ b/mysql-test/suite/galera/t/galera_FK_failure_retry.test @@ -0,0 +1,77 @@ +--source include/galera_cluster.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/galera_have_debug_sync.inc + +CREATE TABLE parent ( + id INT PRIMARY KEY, + j INT +) ENGINE=InnoDB; + +CREATE TABLE child ( + id INT PRIMARY KEY AUTO_INCREMENT, + parent_id INT, + KEY (parent_id), + CONSTRAINT FOREIGN KEY (parent_id) REFERENCES parent(id) +) ENGINE=InnoDB; + +--connection node_1 +INSERT INTO parent VALUES 
(1,1); + +--connection node_2 +--let $wait_condition = SELECT COUNT(*) = 1 from parent; +--source include/wait_condition.inc +SET GLOBAL debug_dbug ='+d,wsrep_force_FK_check_fail'; +SET GLOBAL wsrep_slave_FK_checks = ON; +SET GLOBAL wsrep_applier_FK_failure_retries = 5; + +# make sure sync wait happens in this connection +SET wsrep_sync_wait = 15; + +--connection node_1 +INSERT INTO child VALUES (1,1); + +--connection node_2 +# Ensure that write succeeded in both nodes +SELECT * from child; + +--let $assert_select = final FK constraint check failed +--let $assert_count = 0 +--let $assert_text = warning for FK constraint failure exists +--let $assert_only_after = CURRENT_TEST +--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.2.err +--source include/assert_grep.inc + +# now setting retries low so that FK failure happens +SET GLOBAL wsrep_applier_FK_failure_retries = 1; + +--connection node_1 +INSERT INTO child VALUES (2,1); + +SELECT * from child; + +--connection node_2 +# must be careful to disable wsrep_force_FK_check_fail variable +# as it could happen concurrently with applier execution +# therefore making a non-related select which is waiting for sync wait +# +select * from parent; +SET GLOBAL debug_dbug = NULL; + +--connection node_2 +SET GLOBAL wsrep_slave_FK_checks = DEFAULT; +SET GLOBAL wsrep_applier_FK_failure_retries = DEFAULT; + +--let $assert_select = final FK constraint check failed +--let $assert_count = 1 +--let $assert_text = warning for FK constraint failure missing +--let $assert_only_after = CURRENT_TEST +--let $assert_file = $MYSQLTEST_VARDIR/log/mysqld.2.err +--source include/assert_grep.inc + + +CALL mtr.add_suppression("final FK constraint check failed for.*"); + +--connection node_1 +DROP TABLE child; +DROP TABLE parent; diff --git a/sql/sql_plugin_services.inl b/sql/sql_plugin_services.inl index f73c52f677ab4..5ee542cce4ab6 100644 --- a/sql/sql_plugin_services.inl +++ b/sql/sql_plugin_services.inl @@ -177,7 +177,8 @@ static struct 
wsrep_service_st wsrep_handler = { wsrep_report_bf_lock_wait, wsrep_thd_kill_LOCK, wsrep_thd_kill_UNLOCK, - wsrep_thd_set_PA_unsafe + wsrep_thd_set_PA_unsafe, + wsrep_retry_FK_failure }; static struct thd_specifics_service_st thd_specifics_handler= diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index 921a557299106..0530c0031b8bb 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -5550,6 +5550,12 @@ static Sys_var_ulong Sys_wsrep_retry_autocommit( SESSION_VAR(wsrep_retry_autocommit), CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 10000), DEFAULT(1), BLOCK_SIZE(1)); +static Sys_var_uint Sys_wsrep_applier_FK_failure_retries( + "wsrep_applier_FK_failure_retries", "Max number of times to retry " + "FK constraint check failure in applying", + GLOBAL_VAR(wsrep_applier_FK_failure_retries), CMD_LINE(OPT_ARG), + VALID_RANGE(0, 10000), DEFAULT(1), BLOCK_SIZE(1)); + static bool update_wsrep_auto_increment_control (sys_var *self, THD *thd, enum_var_type type) { if (wsrep_auto_increment_control) diff --git a/sql/wsrep_mysqld.cc b/sql/wsrep_mysqld.cc index 7c8bb683aab0d..7f3c8f86b0319 100644 --- a/sql/wsrep_mysqld.cc +++ b/sql/wsrep_mysqld.cc @@ -97,6 +97,7 @@ my_bool wsrep_log_conflicts; my_bool wsrep_load_data_splitting= 0; // Commit load data every 10K intervals my_bool wsrep_slave_UK_checks; // Slave thread does UK checks my_bool wsrep_slave_FK_checks; // Slave thread does FK checks +uint wsrep_applier_FK_failure_retries; // how many times FK failure to retry in applying my_bool wsrep_restart_slave; // Should mysql slave thread be // restarted, when node joins back? 
my_bool wsrep_desync; // De(re)synchronize the node from the @@ -2878,6 +2879,11 @@ int wsrep_thd_retry_counter(const THD *thd) return thd->wsrep_retry_counter; } +uint wsrep_retry_FK_failure() +{ + return wsrep_applier_FK_failure_retries; +} + extern bool wsrep_thd_ignore_table(THD *thd) { return thd->wsrep_ignore_table; diff --git a/sql/wsrep_mysqld.h b/sql/wsrep_mysqld.h index 921b75ae42d26..77b4a62fad4ab 100644 --- a/sql/wsrep_mysqld.h +++ b/sql/wsrep_mysqld.h @@ -90,6 +90,7 @@ extern bool wsrep_gtid_mode; extern uint32 wsrep_gtid_domain_id; extern std::atomic wsrep_thread_create_failed; +extern uint wsrep_applier_FK_failure_retries; enum enum_wsrep_reject_types { WSREP_REJECT_NONE, /* nothing rejected */ WSREP_REJECT_ALL, /* reject all queries, with UNKNOWN_COMMAND error */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 1f1810323338d..ddd4b09fe5622 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -2743,6 +2743,7 @@ innobase_trx_init( thd, OPTION_RELAXED_UNIQUE_CHECKS); #ifdef WITH_WSREP trx->wsrep = wsrep_on(thd); + trx->wsrep_is_BF = wsrep_thd_is_BF(thd, false); #endif DBUG_VOID_RETURN; diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index 5574980f3ece4..344eb8cb740db 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -737,6 +737,11 @@ struct trx_t { /** whether wsrep_on(mysql_thd) held at the start of transaction */ bool wsrep; bool is_wsrep() const { return UNIV_UNLIKELY(wsrep); } + /* Boolean denoting whether the trx belongs to a BF thread. This is + initialized at the beginning of the transaction, and the + value stays the same over the whole transaction lifetime.
*/ + bool wsrep_is_BF; + /** true, if BF thread is performing unique secondary index scanning */ bool wsrep_UK_scan; bool is_wsrep_UK_scan() const { return UNIV_UNLIKELY(wsrep_UK_scan); } diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index 088981e5b5b08..a517c6c5ab18a 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -3221,6 +3221,32 @@ row_ins_sec_index_entry_low( mtr_commit(&mtr); DBUG_RETURN(err); } +#ifdef WITH_WSREP +static uint wsrep_retries(trx_t *trx, uint err, uint retries) { + + if (trx->wsrep_is_BF && err != DB_SUCCESS && err != DB_LOCK_WAIT) { + + ulint trx_start, trx_end= ULINT_UNDEFINED; + + if (retries > 0) { + retries--; + WSREP_DEBUG("FK constraint check retry for %d retries %d", + err, retries); + + if (wsrep_debug) + srv_printf_innodb_monitor(stderr, true, &trx_start, &trx_end); + + /* force warning messages, if final retry did not help */ + if (retries == 0) { + WSREP_WARN("final FK constraint check failed for %d ", err); + srv_printf_innodb_monitor(stderr, true, &trx_start, &trx_end); + } + } + } else retries = 0; + + return retries; +} +#endif /* WITH_WSREP */ /***************************************************************//** Inserts an entry into a clustered index. Tries first optimistic, @@ -3240,10 +3266,28 @@ row_ins_clust_index_entry( ulint n_uniq; DBUG_ENTER("row_ins_clust_index_entry"); +#ifdef WITH_WSREP + trx_t *trx= thr_get_trx(thr); + uint wsrep_FK_retries= (trx->wsrep_is_BF) ? wsrep_retry_FK_failure() : 0; + /* This counter is for galera_FK_failure_retry test, + DBUG_EXECUTE_IF("wsrep_force_FK_check_fail"...) below. 
*/ + ut_d(uint fail_count=4); +#endif /* WITH_WSREP */ if (!index->table->foreign_set.empty()) { +#ifdef WITH_WSREP + do { +#endif /* WITH_WSREP */ err = row_ins_check_foreign_constraints( index->table, index, true, entry, thr); +#ifdef WITH_WSREP + DBUG_EXECUTE_IF("wsrep_force_FK_check_fail", + if (--fail_count) { err= DB_FAIL; } ); + wsrep_FK_retries= wsrep_retries(trx, err, wsrep_FK_retries); + } while(wsrep_FK_retries); +/* don't let the injected error to propagate and crash this node */ +DBUG_EXECUTE_IF("wsrep_force_FK_check_fail", err= DB_SUCCESS; ); +#endif /* WITH_WSREP */ if (err != DB_SUCCESS) { DBUG_RETURN(err); @@ -3335,10 +3379,23 @@ row_ins_sec_index_entry( DBUG_EXECUTE_IF("row_ins_sec_index_entry_timeout", { DBUG_SET("-d,row_ins_sec_index_entry_timeout"); return(DB_LOCK_WAIT);}); +#ifdef WITH_WSREP + trx_t *trx= thr_get_trx(thr); + uint wsrep_FK_retries= (trx->wsrep_is_BF) ? wsrep_retry_FK_failure() : 0; +#endif /* WITH_WSREP */ if (check_foreign && !index->table->foreign_set.empty()) { +#ifdef WITH_WSREP + do { +#endif /* WITH_WSREP */ err = row_ins_check_foreign_constraints(index->table, index, false, entry, thr); +#ifdef WITH_WSREP + wsrep_FK_retries= wsrep_retries(trx, err, wsrep_FK_retries); + } + while(wsrep_FK_retries); +#endif /* WITH_WSREP */ + if (err != DB_SUCCESS) { return(err);