diff options
| author | Lorry Tar Creator <lorry-tar-importer@baserock.org> | 2015-02-17 17:25:57 +0000 |
|---|---|---|
| committer | <> | 2015-03-17 16:26:24 +0000 |
| commit | 780b92ada9afcf1d58085a83a0b9e6bc982203d1 (patch) | |
| tree | 598f8b9fa431b228d29897e798de4ac0c1d3d970 /src/repmgr/repmgr_msg.c | |
| parent | 7a2660ba9cc2dc03a69ddfcfd95369395cc87444 (diff) | |
| download | berkeleydb-master.tar.gz | |
Diffstat (limited to 'src/repmgr/repmgr_msg.c')
| -rw-r--r-- | src/repmgr/repmgr_msg.c | 655 |
1 files changed, 602 insertions, 53 deletions
diff --git a/src/repmgr/repmgr_msg.c b/src/repmgr/repmgr_msg.c index 13537823..71cb2ada 100644 --- a/src/repmgr/repmgr_msg.c +++ b/src/repmgr/repmgr_msg.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -15,15 +15,19 @@ #include "dbinc_auto/repmgr_auto.h" static int dispatch_app_message __P((ENV *, REPMGR_MESSAGE *)); -static int finish_gmdb_update __P((ENV *, - DB_THREAD_INFO *, DBT *, u_int32_t, u_int32_t, __repmgr_member_args *)); +static int finish_gmdb_update __P((ENV *, DB_THREAD_INFO *, + DBT *, u_int32_t, u_int32_t, u_int32_t, __repmgr_member_args *)); static int incr_gm_version __P((ENV *, DB_THREAD_INFO *, DB_TXN *)); -static void marshal_site_data __P((ENV *, u_int32_t, u_int8_t *, DBT *)); +static void marshal_site_data __P((ENV *, + u_int32_t, u_int32_t, u_int8_t *, DBT *)); static void marshal_site_key __P((ENV *, repmgr_netaddr_t *, u_int8_t *, DBT *, __repmgr_member_args *)); static int message_loop __P((ENV *, REPMGR_RUNNABLE *)); +static int preferred_master_takeover __P((ENV*)); static int process_message __P((ENV*, DBT*, DBT*, int)); static int reject_fwd __P((ENV *, REPMGR_CONNECTION *)); +static int rejoin_connections(ENV *); +static int rejoin_deferred_election(ENV *); static int rescind_pending __P((ENV *, DB_THREAD_INFO *, int, u_int32_t, u_int32_t)); static int resolve_limbo_int __P((ENV *, DB_THREAD_INFO *)); @@ -33,9 +37,13 @@ static int send_permlsn_conn __P((ENV *, REPMGR_CONNECTION *, u_int32_t, DB_LSN *)); static int serve_join_request __P((ENV *, DB_THREAD_INFO *, REPMGR_MESSAGE *)); +static int serve_lsnhist_request __P((ENV *, DB_THREAD_INFO *, + REPMGR_MESSAGE *)); +static int serve_readonly_master_request __P((ENV *, REPMGR_MESSAGE *)); static int serve_remove_request __P((ENV *, DB_THREAD_INFO *, REPMGR_MESSAGE *)); static int serve_repmgr_request __P((ENV *, REPMGR_MESSAGE *)); +static int serve_restart_client_request __P((ENV *, REPMGR_MESSAGE *)); /* * Map one of the phase-1/provisional membership status values to its @@ -72,6 +80,7 @@ message_loop(env, th) REPMGR_RUNNABLE *th; { DB_REP *db_rep; + DB_THREAD_INFO *ip; REP *rep; REPMGR_MESSAGE *msg; REPMGR_CONNECTION *conn; @@ -83,6 +92,7 @@ message_loop(env, th) COMPQUIET(membership, 0); db_rep = env->rep_handle; rep = db_rep->region; + ENV_ENTER(env, ip); LOCK_MUTEX(db_rep->mutex); while ((ret = __repmgr_queue_get(env, &msg, th)) == 0) { incremented = FALSE; @@ -141,7 +151,21 @@ message_loop(env, th) * detect it without the need for application * activity. */ - ret = __rep_flush(env->dbenv); + ret = __rep_flush_int(env); + } else if (db_rep->prefmas_pending == master_switch && + IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER) && + F_ISSET(rep, REP_F_CLIENT)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, +"message_loop heartbeat preferred master switch")); + /* + * We are a preferred master site currently + * running as a client and we have finished + * syncing with the temporary master. It is + * now time to take over as master. + */ + db_rep->prefmas_pending = no_action; + ret = preferred_master_takeover(env); } else { /* * Use heartbeat message to initiate rerequest @@ -162,6 +186,12 @@ message_loop(env, th) db_rep->non_rep_th--; if (ret != 0) goto out; + if (db_rep->view_mismatch) { + __db_errx(env, DB_STR("3699", + "Site is not recorded as a view in the group membership database")); + ret = EINVAL; + goto out; + } } /* * A return of DB_REP_UNAVAIL from __repmgr_queue_get() merely means we @@ -171,6 +201,7 @@ message_loop(env, th) ret = 0; out: UNLOCK_MUTEX(db_rep->mutex); + ENV_LEAVE(env, ip); return (ret); } @@ -341,16 +372,45 @@ process_message(env, control, rec, eid) break; case DB_REP_DUPMASTER: - /* - * Initiate an election if we're configured to be using - * elections, but only if we're *NOT* using leases. When using - * leases, there is never any uncertainty over which site is the - * rightful master, and only the loser gets the DUPMASTER return - * code. - */ - if ((ret = __repmgr_become_client(env)) == 0 && + if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) { + /* + * The preferred master site must restart as a master + * so that it sends out a NEWMASTER to help the client + * sync. It must force a role change so that it + * advances its gen even though it is already master. + * This is needed if there was a temporary master at + * a higher gen that is now restarting as a client. + * A client won't process messages from a master at + * a lower gen than its own. + */ + ret = __repmgr_repstart(env, DB_REP_MASTER, + REP_START_FORCE_ROLECHG); + } else if (IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_CLIENT) && + (ret = __repmgr_become_client(env)) == 0) { + /* + * The preferred master client site must restart as + * client without any elections to enable the preferred + * master site to preserve its own transactions. It + * uses an election thread to repeatedly perform client + * startups so that it will perform its client sync + * when the preferred master's gen has caught up. + */ + LOCK_MUTEX(db_rep->mutex); + ret = __repmgr_init_election(env, + ELECT_F_CLIENT_RESTART); + UNLOCK_MUTEX(db_rep->mutex); + } else if ((ret = __repmgr_become_client(env)) == 0 && FLD_ISSET(rep->config, REP_C_LEASE | REP_C_ELECTIONS) == REP_C_ELECTIONS) { + /* + * Initiate an election if we're configured to be using + * elections, but only if we're *NOT* using leases. + * When using leases, there is never any uncertainty + * over which site is the rightful master, and only the + * loser gets the DUPMASTER return code. + */ LOCK_MUTEX(db_rep->mutex); ret = __repmgr_init_election(env, ELECT_F_IMMED); UNLOCK_MUTEX(db_rep->mutex); @@ -406,6 +466,14 @@ DB_TEST_RECOVERY_LABEL t_ret = __op_rep_exit(env); if (ret == ENOENT) ret = 0; + else if (ret == DB_DELETED && db_rep->demotion_pending) + /* + * If a demotion is in progress, we want to keep + * the repmgr threads instead of bowing out because + * they are needed when we rejoin the replication group + * immediately as a view. + */ + ret = 0; else if (ret == DB_DELETED) ret = __repmgr_bow_out(env); if (t_ret != 0 && ret == 0) @@ -428,8 +496,10 @@ __repmgr_handle_event(env, event, info) void *info; { DB_REP *db_rep; + REP *rep; db_rep = env->rep_handle; + rep = db_rep->region; if (db_rep->selector == NULL) { /* Repmgr is not in use, so all events go to application. */ @@ -457,9 +527,46 @@ __repmgr_handle_event(env, event, info) /* Application still needs to see this. */ break; + case DB_EVENT_REP_MASTER: + case DB_EVENT_REP_STARTUPDONE: + /* + * Detect a rare case where a dupmaster or incomplete gmdb + * operation has left the site's gmdb inconsistent with + * a view callback definition. The user would have correctly + * defined a view callback and called repmgr_start(), but the + * gmdb operation to update this site to a view would have been + * incomplete or rolled back. The site cannot operate in this + * inconsistent state, so set an indicator to cause a message + * thread to panic and terminate. + * + * The one exception is during a demotion to view, when + * this inconsistency is expected for a short time. + */ + if (IS_VALID_EID(db_rep->self_eid) && + PARTICIPANT_TO_VIEW(db_rep, + SITE_FROM_EID(db_rep->self_eid)) && + !db_rep->demotion_pending) + db_rep->view_mismatch = TRUE; + + /* + * In preferred master mode, when the preferred master site + * finishes synchronizing with the temporary master it must + * prepare to take over as master. This is detected by the + * next heartbeat in a message thread, where the takeover is + * actually performed. + */ + if (event == DB_EVENT_REP_STARTUPDONE && + IS_PREFMAS_MODE(env) && + FLD_ISSET(rep->config, REP_C_PREFMAS_MASTER)) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "startupdone set preferred master switch")); + db_rep->prefmas_pending = master_switch; + } + break; default: break; } + COMPQUIET(info, NULL); return (DB_EVENT_NOT_HANDLED); } @@ -504,7 +611,7 @@ send_permlsn(env, generation, lsn) */ policy = site->ack_policy > 0 ? site->ack_policy : rep->perm_policy; - if (policy == DB_REPMGR_ACKS_NONE || + if (IS_VIEW_SITE(env) || policy == DB_REPMGR_ACKS_NONE || (IS_PEER_POLICY(policy) && rep->priority == 0)) ack = FALSE; else @@ -614,26 +721,149 @@ send_permlsn_conn(env, conn, generation, lsn) return (ret); } +/* + * Perform the steps on the preferred master site to take over again as + * preferred master from a temporary master. This routine should only be + * called after the preferred master has restarted as a client and finished + * a client sync with the temporary master. + * + * This routine makes a best effort to wait until all temporary master + * transactions have been applied on this site before taking over. + */ +static int +preferred_master_takeover(env) + ENV *env; +{ + DB_LOG *dblp; + DB_REP *db_rep; + LOG *lp; + REP *rep; + DB_LSN last_ready_lsn, ready_lsn, sync_lsn; + u_long usec; + u_int32_t gen, max_tries, tries; + int ret, synced; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + db_rep = env->rep_handle; + rep = db_rep->region; + gen = 0; + ZERO_LSN(sync_lsn); + ret = 0; + + if (!IS_PREFMAS_MODE(env)) + return (ret); + + /* + * Start by making the temporary master a readonly master so that we + * can know when we have applied all of its transactions on this + * site before taking over. + */ + if ((ret = __repmgr_make_site_readonly_master(env, + 1, &gen, &sync_lsn)) != 0) + return (ret); + DB_ASSERT(env, gen >= rep->gen); + + /* + * Make a best effort to wait until this site has all transactions + * from the temporary master. We want to preserve temporary master + * transactions, but we can't wait forever. If we exceed our wait, + * we restart this site as preferred master anyway. This may + * sacrifice some temporary master transactions in order to preserve + * repgroup write availability. + * + * We restart the number of tries each time we make progress in + * transactions applied, until either we apply through sync_lsn or + * we exceed max_tries without progress. + */ + if ((ret = __repmgr_prefmas_get_wait(env, &max_tries, &usec)) != 0) + return (ret); + tries = 0; + synced = 0; + ZERO_LSN(ready_lsn); + ZERO_LSN(last_ready_lsn); + while (!synced && tries < max_tries) { + __os_yield(env, 0, usec); + tries++; + /* + * lp->ready_lsn is the next LSN we expect to receive, + * which also indicates how much we've applied. sync_lsn + * is the lp->lsn (indicating the next log record expected) + * from the other site. + */ + MUTEX_LOCK(env, rep->mtx_clientdb); + ready_lsn = lp->ready_lsn; + MUTEX_UNLOCK(env, rep->mtx_clientdb); + if (gen == rep->gen && LOG_COMPARE(&ready_lsn, &sync_lsn) >= 0) + synced = 1; + else if (LOG_COMPARE(&ready_lsn, &last_ready_lsn) >= 0) { + /* We are making progress, restart number of tries. */ + last_ready_lsn = ready_lsn; + tries = 0; + } + } + + /* Restart the remote readonly temporary master as a client. */ + if ((ret = __repmgr_restart_site_as_client(env, 1)) != 0) + return (ret); + + /* Restart this site as the preferred master, waiting for + * REP_LOCKOUT_MSG. The NEWCLIENT message sent back from + * restarting the other site as client can briefly lock + * REP_LOCKOUT_MSG to do some cleanup. We don't want this + * to cause the rep_start_int() call to restart this site + * as master to return 0 without doing anything. + */ + ret = __repmgr_become_master(env, REP_START_WAIT_LOCKMSG); + return (ret); +} + static int serve_repmgr_request(env, msg) ENV *env; REPMGR_MESSAGE *msg; { - DB_THREAD_INFO *ip; + DB_REP *db_rep; DBT *dbt; + DB_THREAD_INFO *ip; REPMGR_CONNECTION *conn; + u_int32_t mtype; int ret, t_ret; - ENV_ENTER(env, ip); - switch (REPMGR_OWN_MSG_TYPE(msg->msg_hdr)) { + db_rep = env->rep_handle; + ENV_GET_THREAD_INFO(env, ip); + conn = msg->v.gmdb_msg.conn; + mtype = REPMGR_OWN_MSG_TYPE(msg->msg_hdr); + switch (mtype) { case REPMGR_JOIN_REQUEST: ret = serve_join_request(env, ip, msg); break; + case REPMGR_LSNHIST_REQUEST: + ret = serve_lsnhist_request(env, ip, msg); + break; + case REPMGR_READONLY_MASTER: + ret = serve_readonly_master_request(env, msg); + break; case REPMGR_REJOIN: RPRINT(env, (env, DB_VERB_REPMGR_MISC, "One try at rejoining group automatically")); if ((ret = __repmgr_join_group(env)) == DB_REP_UNAVAIL) ret = __repmgr_bow_out(env); + else if (ret == 0 && IS_PREFMAS_MODE(env)) { + /* + * For preferred master mode, we need to get + * a "regular" connection to the other site without + * calling an election prematurely here. + */ + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Establishing connections after rejoin")); + ret = rejoin_connections(env); + } else if (ret == 0 && db_rep->rejoin_pending) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Calling deferred election after rejoin")); + ret = rejoin_deferred_election(env); + } + db_rep->rejoin_pending = FALSE; break; case REPMGR_REMOVE_REQUEST: ret = serve_remove_request(env, ip, msg); @@ -641,23 +871,32 @@ serve_repmgr_request(env, msg) case REPMGR_RESOLVE_LIMBO: ret = resolve_limbo_wrapper(env, ip); break; + case REPMGR_RESTART_CLIENT: + ret = serve_restart_client_request(env, msg); + break; case REPMGR_SHARING: dbt = &msg->v.gmdb_msg.request; - ret = __repmgr_refresh_membership(env, dbt->data, dbt->size); + ret = __repmgr_refresh_membership(env, dbt->data, dbt->size, + (conn == NULL ? DB_REPMGR_VERSION : conn->version)); break; default: ret = __db_unknown_path(env, "serve_repmgr_request"); break; } - if ((conn = msg->v.gmdb_msg.conn) != NULL) { + if (conn != NULL) { + /* + * A site that removed itself may have already closed its + * connections. Do not return an error and panic if we + * can't close the one-shot GMDB connection for a remove + * request here. + */ if ((t_ret = __repmgr_close_connection(env, conn)) != 0 && - ret == 0) + ret == 0 && mtype != REPMGR_REMOVE_REQUEST) ret = t_ret; if ((t_ret = __repmgr_decr_conn_ref(env, conn)) != 0 && ret == 0) ret = t_ret; } - ENV_LEAVE(env, ip); return (ret); } @@ -674,8 +913,10 @@ serve_join_request(env, ip, msg) { DB_REP *db_rep; REPMGR_CONNECTION *conn; + REPMGR_SITE *site; DBT *dbt; __repmgr_site_info_args site_info; + __repmgr_v4site_info_args v4site_info; u_int8_t *buf; char *host; size_t len; @@ -686,9 +927,18 @@ serve_join_request(env, ip, msg) COMPQUIET(status, 0); conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); dbt = &msg->v.gmdb_msg.request; - ret = __repmgr_site_info_unmarshal(env, - &site_info, dbt->data, dbt->size, NULL); + if (conn->version < 5) { + ret = __repmgr_v4site_info_unmarshal(env, + &v4site_info, dbt->data, dbt->size, NULL); + site_info.host = v4site_info.host; + site_info.port = v4site_info.port; + site_info.status = v4site_info.flags; + site_info.flags = 0; + } else + ret = __repmgr_site_info_unmarshal(env, + &site_info, dbt->data, dbt->size, NULL); host = site_info.host.data; host[site_info.host.size - 1] = '\0'; @@ -703,7 +953,23 @@ serve_join_request(env, ip, msg) LOCK_MUTEX(db_rep->mutex); if ((ret = __repmgr_find_site(env, host, site_info.port, &eid)) == 0) { DB_ASSERT(env, eid != db_rep->self_eid); - status = SITE_FROM_EID(eid)->membership; + site = SITE_FROM_EID(eid); + status = site->membership; + /* + * Remote site electability is usually exchanged when + * a connection is established, but when a new site + * joins the repgroup there is a brief gap between the + * join and the connection. Record electability for + * the joining site so that we are not overly conservative + * about the number of acks we require for a PERM + * transaction if the joining site is unelectable. + */ + if (FLD_ISSET(site_info.flags, SITE_JOIN_ELECTABLE)) { + F_SET(site, SITE_ELECTABLE); + FLD_CLR(site_info.flags, SITE_JOIN_ELECTABLE); + } else + F_CLR(site, SITE_ELECTABLE); + F_SET(site, SITE_HAS_PRIO); } UNLOCK_MUTEX(db_rep->mutex); if (ret != 0) @@ -712,7 +978,8 @@ serve_join_request(env, ip, msg) switch (status) { case 0: case SITE_ADDING: - ret = __repmgr_update_membership(env, ip, eid, SITE_ADDING); + ret = __repmgr_update_membership(env, ip, eid, SITE_ADDING, + site_info.flags); break; case SITE_PRESENT: /* Already in desired state. */ @@ -729,7 +996,7 @@ serve_join_request(env, ip, msg) goto err; LOCK_MUTEX(db_rep->mutex); - ret = __repmgr_marshal_member_list(env, &buf, &len); + ret = __repmgr_marshal_member_list(env, conn->version, &buf, &len); UNLOCK_MUTEX(db_rep->mutex); if (ret != 0) goto err; @@ -760,6 +1027,7 @@ serve_remove_request(env, ip, msg) REPMGR_SITE *site; DBT *dbt; __repmgr_site_info_args site_info; + __repmgr_v4site_info_args v4site_info; char *host; u_int32_t status, type; int eid, ret, t_ret; @@ -768,9 +1036,18 @@ serve_remove_request(env, ip, msg) db_rep = env->rep_handle; conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); dbt = &msg->v.gmdb_msg.request; - ret = __repmgr_site_info_unmarshal(env, - &site_info, dbt->data, dbt->size, NULL); + if (conn->version < 5) { + ret = __repmgr_v4site_info_unmarshal(env, + &v4site_info, dbt->data, dbt->size, NULL); + site_info.host = v4site_info.host; + site_info.port = v4site_info.port; + site_info.status = v4site_info.flags; + site_info.flags = 0; + } else + ret = __repmgr_site_info_unmarshal(env, + &site_info, dbt->data, dbt->size, NULL); host = site_info.host.data; host[site_info.host.size - 1] = '\0'; @@ -810,7 +1087,8 @@ serve_remove_request(env, ip, msg) break; case SITE_PRESENT: case SITE_DELETING: - ret = __repmgr_update_membership(env, ip, eid, SITE_DELETING); + ret = __repmgr_update_membership(env, ip, eid, SITE_DELETING, + site_info.flags); break; default: ret = __db_unknown_path(env, "serve_remove_request"); @@ -829,7 +1107,175 @@ err: default: return (ret); } - return (__repmgr_send_sync_msg(env, conn, type, NULL, 0)); + /* + * It is possible when a site removes itself that by now it has + * already acted on the first GMDB update and closed its connections. + * Do not return an error and panic if we can't send the final + * status of the remove operation. + */ + if ((ret = __repmgr_send_sync_msg(env, conn, type, NULL, 0)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending remove site status message %d", ret)); + return (0); +} + +/* + * Serve the REPMGR_RESTART_CLIENT message by restarting this site as a + * client if it is not already a client. Always sends back a + * REPMGR_PREFMAS_SUCCESS message with an empty payload. + */ +static int +serve_restart_client_request(env, msg) + ENV *env; + REPMGR_MESSAGE *msg; +{ + DB_REP *db_rep; + REP * rep; + REPMGR_CONNECTION *conn; + int ret, t_ret; + + db_rep = env->rep_handle; + rep = db_rep->region; + ret = 0; + + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Serving restart_client request")); + conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); + /* No need to read payload - it is just a dummy byte. */ + + if (IS_PREFMAS_MODE(env) && !F_ISSET(rep, REP_F_CLIENT)) + ret = __repmgr_become_client(env); + + if ((t_ret = __repmgr_send_sync_msg(env, conn, + REPMGR_PREFMAS_SUCCESS, NULL, 0)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending restart client success message %d", ret)); + + if (ret == 0 && t_ret != 0) + ret = t_ret; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Request for restart_client returning %d", ret)); + return (ret); +} + +/* + * Serve the REPMGR_READONLY_MASTER message by turning this site into a + * readonly master. Always sends back a REPMGR_READONLY_RESPONSE message with + * a payload containing this site's gen and next LSN expected. If there are + * any errors, the gen is 0 and the next LSN is [0,0]. + */ +static int +serve_readonly_master_request(env, msg) + ENV *env; + REPMGR_MESSAGE *msg; +{ + REPMGR_CONNECTION *conn; + __repmgr_permlsn_args permlsn; + u_int8_t buf[__REPMGR_PERMLSN_SIZE]; + int ret, t_ret; + + ret = 0; + + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Serving readonly_master request")); + conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); + /* No need to read payload - it is just a dummy byte. */ + + if (IS_PREFMAS_MODE(env)) + ret = __rep_become_readonly_master(env, + &permlsn.generation, &permlsn.lsn); + + __repmgr_permlsn_marshal(env, &permlsn, buf); + if ((t_ret = __repmgr_send_sync_msg(env, conn, + REPMGR_READONLY_RESPONSE, buf, __REPMGR_PERMLSN_SIZE)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending readonly response message %d", ret)); + if (ret == 0 && t_ret != 0) + ret = t_ret; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Request for readonly_master returning %d", ret)); + return (ret); +} + +/* + * Serve the REPMGR_LSNHIST_REQUEST message by retrieving information from + * this site's LSN history database for the requested gen. If the requested + * gen exists at this site, sends back a REPMGR_LSNHIST_RESPONSE message + * containing the LSN and timestamp at the requested gen and the LSN for the + * next gen if that gen exists (next gen LSN is [0,0] if next gen doesn't + * yet exist at this site.) Sends back a PREFMAS_FAILURE message if the + * requested gen does not yet exist at this site or if there are any errors. + */ +static int +serve_lsnhist_request(env, ip, msg) + ENV *env; + DB_THREAD_INFO *ip; + REPMGR_MESSAGE *msg; +{ + REPMGR_CONNECTION *conn; + DBT *dbt; + __repmgr_lsnhist_match_args lsnhist_match; + __rep_lsn_hist_data_args lsnhist_data, next_lsnhist_data; + __rep_lsn_hist_key_args key; + u_int8_t match_buf[__REPMGR_LSNHIST_MATCH_SIZE]; + DB_LSN next_gen_lsn; + int ret, t_ret; + + RPRINT(env, (env, DB_VERB_REPMGR_MISC, "Serving lsnhist request")); + conn = msg->v.gmdb_msg.conn; + DB_ASSERT(env, conn->version > 0 && conn->version <= DB_REPMGR_VERSION); + /* Read lsn_hist_key incoming payload to get gen being requested. */ + dbt = &msg->v.gmdb_msg.request; + if ((ret = __rep_lsn_hist_key_unmarshal(env, + &key, dbt->data, dbt->size, NULL)) != 0) + return (ret); + if (key.version != REP_LSN_HISTORY_FMT_VERSION) { + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "serve_lsnhist_request version mismatch")); + return (0); + } + + /* + * There's no need to retry if we don't find an lsnhist record for + * requested gen. This site is either a temporary master or a client, + * which means that if it doesn't already have an lsnhist record at + * this gen, it is highly unlikely to get one in the near future. + */ + if ((ret = __rep_get_lsnhist_data(env, + ip, key.gen, &lsnhist_data)) == 0) { + + if ((t_ret = __rep_get_lsnhist_data(env, + ip, key.gen + 1, &next_lsnhist_data)) == 0) + next_gen_lsn = next_lsnhist_data.lsn; + else + ZERO_LSN(next_gen_lsn); + + lsnhist_match.lsn = lsnhist_data.lsn; + lsnhist_match.hist_sec = lsnhist_data.hist_sec; + lsnhist_match.hist_nsec = lsnhist_data.hist_nsec; + lsnhist_match.next_gen_lsn = next_gen_lsn; + __repmgr_lsnhist_match_marshal(env, &lsnhist_match, match_buf); + if ((t_ret = __repmgr_send_sync_msg(env, conn, + REPMGR_LSNHIST_RESPONSE, match_buf, + __REPMGR_LSNHIST_MATCH_SIZE)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending lsnhist response message %d", + ret)); + } else if ((t_ret = __repmgr_send_sync_msg(env, conn, + REPMGR_PREFMAS_FAILURE, NULL, 0)) != 0) + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Problem sending prefmas failure message %d", ret)); + + /* Do not return an error if LSN history record not found. */ + if (ret == DB_NOTFOUND) + ret = 0; + if (ret == 0 && t_ret != 0) + ret = t_ret; + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Request for lsnhist returning %d", ret)); + return (ret); } /* @@ -917,7 +1363,13 @@ resolve_limbo_int(env, ip) if (orig_status == SITE_PRESENT || orig_status == 0) goto out; - if (IS_ZERO_LSN(db_rep->limbo_failure)) + /* + * It is possible after an autotakeover on a master to have no + * limbo_failure LSN but to have a limbo_victim that was found + * in the gmdb that still needs to be resolved. + */ + if (IS_ZERO_LSN(db_rep->limbo_failure) && + !db_rep->limbo_resolution_needed) goto out; /* @@ -947,7 +1399,8 @@ resolve_limbo_int(env, ip) ip, NULL, &txn, DB_IGNORE_LEASE)) != 0) goto out; - marshal_site_data(env, orig_status, data_buf, &data_dbt); + marshal_site_data(env, + orig_status, site->gmdb_flags, data_buf, &data_dbt); ret = __db_put(db_rep->gmdb, ip, txn, &key_dbt, &data_dbt, 0); if ((t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && @@ -980,15 +1433,15 @@ resolve_limbo_int(env, ip) UNLOCK_MUTEX(db_rep->mutex); locked = FALSE; status = NEXT_STATUS(orig_status); - if ((ret = finish_gmdb_update(env, - ip, &key_dbt, orig_status, status, &logrec)) != 0) + if ((ret = finish_gmdb_update(env, ip, + &key_dbt, orig_status, status, site->gmdb_flags, &logrec)) != 0) goto out; /* Track modified membership status in our in-memory sites array. */ LOCK_MUTEX(db_rep->mutex); locked = TRUE; if ((ret = __repmgr_set_membership(env, - addr.host, addr.port, status)) != 0) + addr.host, addr.port, status, site->gmdb_flags)) != 0) goto out; __repmgr_set_sites(env); @@ -1005,14 +1458,15 @@ out: * status is inferred (ADDING -> PRESENT, or DELETING -> 0). * * PUBLIC: int __repmgr_update_membership __P((ENV *, - * PUBLIC: DB_THREAD_INFO *, int, u_int32_t)); + * PUBLIC: DB_THREAD_INFO *, int, u_int32_t, u_int32_t)); */ int -__repmgr_update_membership(env, ip, eid, pstatus) +__repmgr_update_membership(env, ip, eid, pstatus, site_flags) ENV *env; DB_THREAD_INFO *ip; int eid; u_int32_t pstatus; /* Provisional status. */ + u_int32_t site_flags; { DB_REP *db_rep; REPMGR_SITE *site; @@ -1092,7 +1546,7 @@ retry: * those seem even more confusing. */ if ((ret = __repmgr_set_membership(env, - addr.host, addr.port, pstatus)) != 0) + addr.host, addr.port, pstatus, site_flags)) != 0) goto err; __repmgr_set_sites(env); @@ -1108,7 +1562,7 @@ retry: if ((ret = __txn_begin(env, ip, NULL, &txn, DB_IGNORE_LEASE)) != 0) goto err; marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec); - marshal_site_data(env, pstatus, status_buf, &data_dbt); + marshal_site_data(env, pstatus, site_flags, status_buf, &data_dbt); if ((ret = __db_put(db_rep->gmdb, ip, txn, &key_dbt, &data_dbt, 0)) != 0) goto err; @@ -1152,13 +1606,14 @@ retry: locked = FALSE; if ((ret = finish_gmdb_update(env, ip, - &key_dbt, pstatus, ult_status, &logrec)) != 0) + &key_dbt, pstatus, ult_status, site_flags, &logrec)) != 0) goto err; /* Track modified membership status in our in-memory sites array. */ LOCK_MUTEX(db_rep->mutex); locked = TRUE; - ret = __repmgr_set_membership(env, addr.host, addr.port, ult_status); + ret = __repmgr_set_membership(env, addr.host, addr.port, + ult_status, site_flags); __repmgr_set_sites(env); err: @@ -1173,7 +1628,7 @@ err: * that we keep in sync. */ (void)__repmgr_set_membership(env, - addr.host, addr.port, orig_status); + addr.host, addr.port, orig_status, site_flags); } if ((t_ret = __repmgr_cleanup_gmdb_op(env, do_close)) != 0 && ret == 0) @@ -1215,13 +1670,14 @@ retry: UNLOCK_MUTEX(db_rep->mutex); marshal_site_key(env, &addr, key_buf, &key_dbt, &logrec); - if ((ret = finish_gmdb_update(env, - ip, &key_dbt, cur_status, new_status, &logrec)) != 0) + if ((ret = finish_gmdb_update(env, ip, + &key_dbt, cur_status, new_status, site->gmdb_flags, &logrec)) != 0) goto err; /* Track modified membership status in our in-memory sites array. */ LOCK_MUTEX(db_rep->mutex); - ret = __repmgr_set_membership(env, addr.host, addr.port, new_status); + ret = __repmgr_set_membership(env, addr.host, addr.port, + new_status, site->gmdb_flags); __repmgr_set_sites(env); UNLOCK_MUTEX(db_rep->mutex); @@ -1301,11 +1757,11 @@ __repmgr_set_gm_version(env, ip, txn, version) * really deleted. */ static int -finish_gmdb_update(env, ip, key_dbt, prev_status, status, logrec) +finish_gmdb_update(env, ip, key_dbt, prev_status, status, flags, logrec) ENV *env; DB_THREAD_INFO *ip; DBT *key_dbt; - u_int32_t prev_status, status; + u_int32_t prev_status, status, flags; __repmgr_member_args *logrec; { DB_REP *db_rep; @@ -1324,7 +1780,7 @@ finish_gmdb_update(env, ip, key_dbt, prev_status, status, logrec) if (status == 0) ret = __db_del(db_rep->gmdb, ip, txn, key_dbt, 0); else { - marshal_site_data(env, status, data_buf, &data_dbt); + marshal_site_data(env, status, flags, data_buf, &data_dbt); ret = __db_put(db_rep->gmdb, ip, txn, key_dbt, &data_dbt, 0); } if (ret != 0) @@ -1617,16 +2073,18 @@ marshal_site_key(env, addr, buf, dbt, logrec) } static void -marshal_site_data(env, status, buf, dbt) +marshal_site_data(env, status, flags, buf, dbt) ENV *env; u_int32_t status; + u_int32_t flags; u_int8_t *buf; DBT *dbt; { - __repmgr_membership_data_args member_status; + __repmgr_membership_data_args member_data; - member_status.flags = status; - __repmgr_membership_data_marshal(env, &member_status, buf); + member_data.status = status; + member_data.flags = flags; + __repmgr_membership_data_marshal(env, &member_data, buf); DB_INIT_DBT(*dbt, buf, __REPMGR_MEMBERSHIP_DATA_SIZE); } @@ -1640,16 +2098,107 @@ __repmgr_set_sites(env) ENV *env; { DB_REP *db_rep; + REP *rep; int ret; u_int32_t n; u_int i; db_rep = env->rep_handle; + rep = db_rep->region; for (i = 0, n = 0; i < db_rep->site_cnt; i++) { - if (db_rep->sites[i].membership > 0) + /* + * Views do not count towards nsites because they cannot + * vote in elections, become master or contribute to + * durability. + */ + if (db_rep->sites[i].membership > 0 && + !FLD_ISSET(db_rep->sites[i].gmdb_flags, SITE_VIEW)) n++; } ret = __rep_set_nsites_int(env, n); DB_ASSERT(env, ret == 0); + if (FLD_ISSET(rep->config, + REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT) && + rep->config_nsites > 2) + __db_errx(env, DB_STR("3701", + "More than two sites in preferred master replication group")); +} + +/* + * If a site is rejoining a 2-site repgroup with 2SITE_STRICT off + * and has a rejection because it needs to catch up with the latest + * group membership database, it cannot call an election right away + * because it would win with only its own vote and ignore an existing + * master in the repgroup. Instead, this routine is used to call the + * deferred election after the site has rejoined the repgroup successfully. + */ +static int +rejoin_deferred_election(env) + ENV *env; +{ + DB_REP *db_rep; + u_int32_t flags; + int eid, ret; + + db_rep = env->rep_handle; + LOCK_MUTEX(db_rep->mutex); + + /* + * First, retry all connections so that the election can communicate + * with the other sites. Normally there should only be one other + * site in the repgroup, but it is safest to retry all remote sites + * found in case the group membership changed while we were gone. + */ + FOR_EACH_REMOTE_SITE_INDEX(eid) { + if ((ret = + __repmgr_schedule_connection_attempt(env, eid, TRUE)) != 0) + break; + } + + /* + * Call an immediate, but not a fast, election because a fast + * election reduces the number of votes needed by 1. + */ + flags = ELECT_F_EVENT_NOTIFY; + if (FLD_ISSET(db_rep->region->config, REP_C_ELECTIONS)) + LF_SET(ELECT_F_IMMED); + else + RPRINT(env, (env, DB_VERB_REPMGR_MISC, + "Deferred rejoin election, but no elections")); + ret = __repmgr_init_election(env, flags); + + UNLOCK_MUTEX(db_rep->mutex); + return (ret); +} +/* + * If a site is rejoining a preferred master replication group and has a + * rejection because it needs to catch up with the latest group membership + * database, it needs to establish its "regular" connection to the other site + * so that it can proceed through the preferred master startup sequence. + */ +static int +rejoin_connections(env) + ENV *env; +{ + DB_REP *db_rep; + int eid, ret; + + db_rep = env->rep_handle; + ret = 0; + LOCK_MUTEX(db_rep->mutex); + + /* + * Retry all connections. Normally there should only be one other + * site in the repgroup, but it is safest to retry all remote sites + * found in case the group membership changed while we were gone. + */ + FOR_EACH_REMOTE_SITE_INDEX(eid) { + if ((ret = + __repmgr_schedule_connection_attempt(env, eid, TRUE)) != 0) + break; + } + + UNLOCK_MUTEX(db_rep->mutex); + return (ret); } |
