From: Petr Jelinek Date: Mon, 9 Jun 2014 08:57:09 +0000 (+0200) Subject: bdr: Add bdr_init_replica utility for starting up new bdr node from a pg_basebackup... X-Git-Url: http://git.postgresql.org/gitweb/static/gitweb.js?a=commitdiff_plain;h=c7887adb61098faded4fe3b60c57d6e9a3f61d90;p=users%2Fandresfreund%2Fpostgres.git bdr: Add bdr_init_replica utility for starting up new bdr node from a pg_basebackup copy. --- diff --git a/contrib/bdr/Makefile b/contrib/bdr/Makefile index 62d4975496..812cd8324b 100644 --- a/contrib/bdr/Makefile +++ b/contrib/bdr/Makefile @@ -10,6 +10,7 @@ include $(top_srcdir)/contrib/contrib-global.mk all: $(MAKE) -f $(top_srcdir)/contrib/bdr/output.mk $(MAKECMDGOALS) $(MAKE) -f $(top_srcdir)/contrib/bdr/worker.mk $(MAKECMDGOALS) + $(MAKE) -f $(top_srcdir)/contrib/bdr/bdr_init_copy.mk $(MAKECMDGOALS) clean: all diff --git a/contrib/bdr/bdr.c b/contrib/bdr/bdr.c index 516585174c..75cef72c1d 100644 --- a/contrib/bdr/bdr.c +++ b/contrib/bdr/bdr.c @@ -18,6 +18,7 @@ #include "bdr_locks.h" #include "libpq-fe.h" +#include "funcapi.h" #include "miscadmin.h" #include "pgstat.h" #include "port.h" @@ -40,6 +41,8 @@ #include "mb/pg_wchar.h" +#include "nodes/execnodes.h" + #include "postmaster/bgworker.h" #include "replication/replication_identifier.h" @@ -112,9 +115,12 @@ static void bdr_worker_shmem_create_workers(void); Datum bdr_apply_pause(PG_FUNCTION_ARGS); Datum bdr_apply_resume(PG_FUNCTION_ARGS); +Datum bdr_get_connection_config(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(bdr_apply_pause); PG_FUNCTION_INFO_V1(bdr_apply_resume); +PG_FUNCTION_INFO_V1(bdr_get_connection_config); + /* * Converts an int64 to network byte order. @@ -600,6 +606,7 @@ bdr_apply_main(Datum main_arg) * Check whether we already replayed something so we don't replay it * multiple times. */ + start_from = RemoteCommitFromCachedReplicationIdentifier(); elog(INFO, "starting up replication from %u at %X/%X", @@ -1606,6 +1613,15 @@ _PG_init(void) 0, NULL, NULL, NULL); + DefineCustomBoolVariable("bdr.init_from_basedump", + "Internal. Set during local initialization from basebackup only", + NULL, + &bdr_init_from_basedump, + false, + PGC_BACKEND, + 0, + NULL, NULL, NULL); + bdr_conflict_logging_create_gucs(); /* if nothing is configured, we're done */ @@ -1638,7 +1654,7 @@ _PG_init(void) */ if (bdr_max_workers == -1) { - bdr_max_workers = list_length(connames) * 2; + bdr_max_workers = list_length(connames) * 3; elog(DEBUG1, "bdr: bdr_max_workers unset, configuring for %d workers", bdr_max_workers); } @@ -1886,3 +1902,59 @@ bdr_apply_resume(PG_FUNCTION_ARGS) BdrWorkerCtl->pause_apply = false; PG_RETURN_VOID(); } + +Datum +bdr_get_connection_config(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + uint32 off; + + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + if (tupdesc->natts != 2) + elog(ERROR, "wrong function definition"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + for (off = 0; off < bdr_max_workers; off++) + { + Datum values[2]; + bool nulls[2]; + BdrConnectionConfig *cfg = bdr_connection_configs[off]; + + if (cfg == NULL || !cfg->is_valid) + continue; + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + values[0] = PointerGetDatum(cfg->dbname); + values[1] = PointerGetDatum(cfg->dsn); + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} diff --git a/contrib/bdr/bdr.h b/contrib/bdr/bdr.h index f54cd2bdb7..8e51693349 100644 --- a/contrib/bdr/bdr.h +++ b/contrib/bdr/bdr.h @@ -16,9 +16,9 @@ #include "utils/resowner.h" #include "storage/lock.h" +#include "bdr_internal.h" + #define BDR_VERSION_NUM 500 -#define BDR_SLOT_NAME_FORMAT "bdr_%u_%s_%u_%u__%s" -#define BDR_NODE_ID_FORMAT "bdr_"UINT64_FORMAT"_%u_%u_%u_%s" /* Right now replication_name isn't used; make it easily found for later */ #define EMPTY_REPLICATION_NAME "" @@ -191,23 +191,6 @@ typedef struct BdrWorker } BdrWorker; -/* GUC storage for a configured BDR connection. */ -typedef struct BdrConnectionConfig -{ - char *dsn; - int apply_delay; - bool init_replica; - char *replica_local_dsn; - /* - * These aren't technically GUCs, but are per-connection config - * information obtained from the GUCs. - */ - char *name; - char *dbname; - /* Connection config might be broken (blank dsn, etc) */ - bool is_valid; -} BdrConnectionConfig; - /* * Params for every connection in bdr.connections. * @@ -219,6 +202,7 @@ extern BdrConnectionConfig **bdr_connection_configs; extern int bdr_default_apply_delay; extern int bdr_max_workers; extern char *bdr_temp_dump_directory; +extern bool bdr_init_from_basedump; /* * Header for the shared memory segment ref'd by the BdrWorkerCtl ptr, diff --git a/contrib/bdr/bdr_init_copy.c b/contrib/bdr/bdr_init_copy.c new file mode 100644 index 0000000000..4c64397ef6 --- /dev/null +++ b/contrib/bdr/bdr_init_copy.c @@ -0,0 +1,1564 @@ +#include "postgres_fe.h" + +#include "port.h" + +#include "libpq-fe.h" +#include "libpq-int.h" + +#include "miscadmin.h" + +#include "access/timeline.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bdr_internal.h" + +#define LLOGCDIR "pg_llog/checkpoints" + +typedef struct RemoteInfo { + uint64 sysid; + TimeLineID tlid; + Oid dboid; +} RemoteInfo; + +static char *argv0 = NULL; +static const char *progname; +static uint64 system_identifier; +static NameData restore_point_name; +static char *data_dir = NULL; +static char pid_file[MAXPGPATH]; +static time_t start_time; + +/* defined as static so that die() can close them */ +static PGconn *local_conn = NULL; +static PGconn *remote_conn = NULL; + +BdrConnectionConfig **bdr_connection_configs; +size_t bdr_connection_config_count; + +static void signal_handler(int sig); +static void usage(void); +static void die(const char *fmt,...) +__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); +static void print_msg(const char *fmt,...) +__attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); + +static int run_pg_ctl(const char *arg); +static char *get_postgres_guc_value(char *guc, char *defval); +static void wait_postmaster_connection(char **out_connstr); +static void wait_postgres_shutdown(void); + +static void remove_unwanted_state(void); +static void initialize_replication_identifiers(char *remote_lsn); +static void create_replication_identifier(PGconn *conn, + const char *remote_ident, char *remote_lsn); +static char *create_restore_point(char *remote_connstr); +static void initialize_replication_slots(bool init_replica); +static void create_replication_slot(PGconn *conn, Name slot_name); +static RemoteInfo *get_remote_info(PGconn *conn); +static Oid get_dboid_from_dbname(PGconn *conn, const char* dbname); + +static uint64 GenerateSystemIdentifier(void); +static int set_sysid(void); + +static void read_bdr_config(void); +static void WriteRecoveryConf(PQExpBuffer contents); + +static char *detect_remote_conninfo(void); +char *get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser); +static char *PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * values); +static char *escapeConninfoValue(const char *val); + +static bool parse_bool(const char *value, bool *result); +static bool parse_bool_with_len(const char *value, size_t len, bool *result); +static char *trimwhitespace(char *str); +static char **split_list_guc(char *str, size_t *count); + +static bool is_pg_dir(char *path); +static char *find_other_exec_or_die(const char *argv0, const char *target, const char *versionstr); +static bool postmaster_is_alive(pid_t pid); +static long get_pgpid(void); +static char **readfile(const char *path); +static void free_readfile(char **optlines); + + +void signal_handler(int sig) +{ + if (sig == SIGINT) + { + die(_("\nCanceling...\n")); + } +} + + +int +main(int argc, char **argv) +{ + int i; + int c; + PQExpBuffer recoveryconfcontents = createPQExpBuffer(); + char *remote_lsn; + bool hot_standby; + char *local_connstr = NULL; + char *remote_connstr = NULL; + char *dbhost = NULL, + *dbport = NULL, + *dbuser = NULL; + + argv0 = argv[0]; + progname = get_progname(argv[0]); + start_time = time(NULL); + signal(SIGINT, signal_handler); + + /* check for --help */ + if (argc > 1) + { + for (i = 1; i < argc; i++) + { + if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-?") == 0) + { + usage(); + exit(0); + } + } + } + + /* Option parsing and validation */ + while ((c = getopt(argc, argv, "D:d:h:p:U:")) != -1) + { + switch (c) + { + case 'D': + data_dir = pg_strdup(optarg); + break; + case 'd': + remote_connstr = pg_strdup(optarg); + break; + case 'h': + dbhost = pg_strdup(optarg); + break; + case 'p': + dbport = pg_strdup(optarg); + break; + case 'U': + dbuser = pg_strdup(optarg); + break; + default: + fprintf(stderr, _("%s: unknown option\n"), progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + } + + if (data_dir == NULL) + { + fprintf(stderr, _("%s: no data directory specified\n"), progname); + fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname); + exit(1); + } + if (!is_pg_dir(data_dir)) + { + die(_("%s: \"%s\" is not valid postgres data directory\n"), progname, data_dir); + } + snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", data_dir); + + print_msg(_("%s: starting...\n"), progname); + + /* + * Initialization + */ + system_identifier = GenerateSystemIdentifier(); + read_bdr_config(); + + if (!remote_connstr && !dbhost && !dbport && !dbuser) + remote_connstr = detect_remote_conninfo(); + else + remote_connstr = get_conninfo(remote_connstr, dbhost, dbport, dbuser); + + if (!remote_connstr || !strlen(remote_connstr)) + die(_("Could not detect remote connection\n")); + + /* Hot standby would start cluster in read only mode, we don't want that. */ + if (!parse_bool(get_postgres_guc_value("hot_standby", NULL), &hot_standby)) + die(_("Invalid boolean value for configuration parameter \"hot_standby\"\n")); + if (hot_standby) + die(_("Cluster cannot be configured with hot_standby = on when using bdr\n")); + + remove_unwanted_state(); + + /* + * Initialization done, create replication slots to init node + * and restore point on remote side. + */ + print_msg(_("Creating primary replication slots...\n")); + initialize_replication_slots(true); + + print_msg(_("Creating restore point...\n")); + snprintf(NameStr(restore_point_name), NAMEDATALEN, + "bdr_"UINT64_FORMAT, system_identifier); + remote_lsn = create_restore_point(remote_connstr); + + /* + * Get local db to consistent state (for lsn after slot creation). + */ + print_msg(_("Bringing cluster to the restore point...\n")); + appendPQExpBuffer(recoveryconfcontents, "standby_mode = 'on'\n"); + appendPQExpBuffer(recoveryconfcontents, "recovery_target_name = '%s'\n", NameStr(restore_point_name)); + appendPQExpBuffer(recoveryconfcontents, "recovery_target_inclusive = true\n"); + appendPQExpBuffer(recoveryconfcontents, "primary_conninfo = '%s'\n", remote_connstr); + WriteRecoveryConf(recoveryconfcontents); + + run_pg_ctl("start -w -l \"bdr_init_copy_postgres.log\" -o \"-c shared_preload_libraries=''\""); + wait_postmaster_connection(&local_connstr); + + if (local_connstr == NULL) + die(_("Failed to detect local connection info.")); + + /* + * Postgres should have reached restore point and is accepting connections, + * create slots to other nodes and local replication identifiers. + */ + local_conn = PQconnectdb(local_connstr); + if (PQstatus(local_conn) != CONNECTION_OK) + die(_("Connection to database failed: %s"), PQerrorMessage(local_conn)); + + print_msg(_("Creating secondary replication slots...\n")); + initialize_replication_slots(false); + print_msg(_("Creating local replication identifier...\n")); + initialize_replication_identifiers(remote_lsn); + + PQfinish(local_conn); + local_conn = NULL; + + /* + * Make this node functional as individual bdr node and start it. + */ + run_pg_ctl("stop"); + wait_postgres_shutdown(); + + set_sysid(); + + print_msg(_("Starting the cluster...\n")); + run_pg_ctl("start -w -o \"-c bdr.init_from_basedump=true\""); + + return 0; +} + + +/* + * Print help. + */ +static void +usage(void) +{ + printf(_("%s initializes bdr from PostgreSQL instance made using pg_basebackup.\n\n"), progname); + printf(_("Usage:\n")); + printf(_(" %s [OPTION]...\n"), progname); + printf(_("\nGeneral options:\n")); + printf(_(" -D, --pgdata=DIRECTORY base backup directory\n")); + printf(_("\nConnection options:\n")); + printf(_(" -d, --dbname=CONNSTR connection string\n")); + printf(_(" -h, --host=HOSTNAME database server host or socket directory\n")); + printf(_(" -p, --port=PORT database server port number\n")); + printf(_(" -U, --username=NAME connect as specified database user\n")); +} + +/* + * Print error and exit. + */ +static void +die(const char *fmt,...) +{ + va_list argptr; + va_start(argptr, fmt); + vfprintf(stderr, fmt, argptr); + va_end(argptr); + + PQfinish(local_conn); + PQfinish(remote_conn); + + if (get_pgpid()) + run_pg_ctl("stop -s"); + + exit(1); +} + +/* + * Print message to stdout and flush + */ +static void +print_msg(const char *fmt,...) +{ + va_list argptr; + va_start(argptr, fmt); + vfprintf(stdout, fmt, argptr); + va_end(argptr); + fflush(stdout); +} + + +/* + * Start pg_ctl with given argument(s) - used to start/stop postgres + */ +static int +run_pg_ctl(const char *arg) +{ + int ret; + PQExpBuffer cmd = createPQExpBuffer(); + char *exec_path = find_other_exec_or_die(argv0, "pg_ctl", "pg_ctl (PostgreSQL) " PG_VERSION "\n"); + + appendPQExpBuffer(cmd, "%s %s -D \"%s\"", exec_path, arg, data_dir); + + ret = system(cmd->data); + + destroyPQExpBuffer(cmd); + + return ret; +} + + +/* + * Ugly way to read postgresql.conf + */ +static char * +get_postgres_guc_value(char *guc, char *defval) +{ + FILE *fp; + int status; + PQExpBuffer cmd = createPQExpBuffer(); + char *exec_path = find_other_exec_or_die(argv0, "postgres", PG_BACKEND_VERSIONSTR); + PQExpBuffer retbuf = createPQExpBuffer(); + char buf[8192]; + char *ret; + + printfPQExpBuffer(cmd, "%s -D \"%s\" -C \"%s\" 2>\"%s\"", exec_path, data_dir, guc, DEVNULL); + + fp = popen(cmd->data, "r"); + while (fgets(buf, sizeof(buf), fp) != NULL) + appendPQExpBufferStr(retbuf, buf); + + status = pclose(fp); + destroyPQExpBuffer(cmd); + + if (status != 0) + { + destroyPQExpBuffer(retbuf); + return defval; + } + + ret = trimwhitespace(retbuf->data); + destroyPQExpBuffer(retbuf); + + return ret; +} + +/* + * Set system identifier to system id we used for registering the slots. + */ +static int +set_sysid(void) +{ + int ret; + PQExpBuffer cmd = createPQExpBuffer(); + char *exec_path = find_other_exec_or_die(argv0, "pg_resetxlog", "pg_resetxlog (PostgreSQL) " PG_VERSION "\n"); + + appendPQExpBuffer(cmd, "%s \"-s "UINT64_FORMAT"\" \"%s\"", exec_path, system_identifier, data_dir); + + ret = system(cmd->data); + + destroyPQExpBuffer(cmd); + + return ret; +} + + +/* + * Read bdr configuration + * + * This is somewhat ugly version of bdr_create_con_gucs and parts of _PG_init + */ +static void +read_bdr_config(void) +{ + char *connections; + char *errormsg = NULL; + int connection_config_idx; + size_t connection_count = 0; + char **connames; + PQconninfoOption *options; + PQconninfoOption *cur_option; + + connections = get_postgres_guc_value("bdr.connections", NULL); + if (!connections) + die(_("bdr.connections is empty")); + + connames = split_list_guc(connections, &connection_count); + pg_free(connections); + + bdr_connection_config_count = connection_count; + bdr_connection_configs = (BdrConnectionConfig**) + pg_malloc0(bdr_connection_config_count * sizeof(BdrConnectionConfig*)); + + for (connection_config_idx = 0; connection_config_idx < connection_count; connection_config_idx++) + { + char *name = (char *) connames[connection_config_idx]; + char *optname_dsn = pg_malloc(strlen(name) + 30); + char *optname_replica = pg_malloc(strlen(name) + 30); + char *optname_local_dbname = pg_malloc(strlen(name) + 30); + BdrConnectionConfig *opts; + + sprintf(optname_dsn, "bdr.%s_dsn", name); + sprintf(optname_replica, "bdr.%s_init_replica", name); + sprintf(optname_local_dbname, "bdr.%s_local_dbname", name); + + opts = pg_malloc0(sizeof(BdrConnectionConfig)); + opts->name = pg_strdup(name); + opts->is_valid = false; + + bdr_connection_configs[connection_config_idx] = opts; + + opts->dsn = get_postgres_guc_value(optname_dsn, NULL); + if (!opts->dsn) + continue; + + if (!parse_bool(get_postgres_guc_value(optname_replica, "false"), &opts->init_replica)) + die(_("Invalid boolean value for configuration parameter \"%s\"\n"), optname_replica); + + opts->dbname = get_postgres_guc_value(optname_local_dbname, NULL); + + options = PQconninfoParse(opts->dsn, &errormsg); + if (errormsg != NULL) + { + char *str = pg_strdup(errormsg); + + PQfreemem(errormsg); + die(_("bdr %s: error in dsn: %s"), name, str); + } + + if (opts->dbname == NULL) + { + cur_option = options; + while (cur_option->keyword != NULL) + { + if (strcmp(cur_option->keyword, "dbname") == 0) + { + if (cur_option->val == NULL) + die(_("bdr %s: no dbname set"), name); + + opts->dbname = pg_strdup(cur_option->val); + } + cur_option++; + } + } + + opts->is_valid = true; + + /* cleanup */ + PQconninfoFree(options); + } +} + + + +/* + * Cleans everything that was replicated via basebackup but we don't want it. + */ +static void +remove_unwanted_state(void) +{ + DIR *lldir; + struct dirent *llde; + PQExpBuffer llpath = createPQExpBuffer(); + PQExpBuffer filename = createPQExpBuffer(); + + printfPQExpBuffer(llpath, "%s/%s", data_dir, LLOGCDIR); + + /* + * Remove stray logical replication checkpoints + */ + lldir = opendir(llpath->data); + if (lldir == NULL) + { + die(_("Could not open directory \"%s\": %s\n"), + llpath->data, strerror(errno)); + } + + while (errno = 0, (llde = readdir(lldir)) != NULL) + { + size_t len = strlen(llde->d_name); + if (len > 5 && !strcmp(llde->d_name + len - 5, ".ckpt")) + { + printfPQExpBuffer(filename, "%s/%s", llpath->data, llde->d_name); + + if (unlink(filename->data) != 0) + { + die(_("Could not unlink checkpoint file \"%s\": %s\n"), + filename->data, strerror(errno)); + } + } + } + + destroyPQExpBuffer(llpath); + destroyPQExpBuffer(filename); + + if (errno) + { + die(_("Could not read directory \"%s\": %s\n"), + LLOGCDIR, strerror(errno)); + } + + if (closedir(lldir)) + { + die(_("Could not close directory \"%s\": %s\n"), + LLOGCDIR, strerror(errno)); + } +} + + +/* + * Initialize replication slots + * + * Get connection configs from bdr and use the info + * to register replication slots for future use. + */ +static void +initialize_replication_slots(bool init_replica) +{ + int i; + + for (i = 0; i < bdr_connection_config_count; i++) + { + NameData slot_name; + char remote_ident[256]; + RemoteInfo *ri; + TimeLineID tlid; + Oid dboid; + char system_identifier_s[32]; + BdrConnectionConfig *cfg = bdr_connection_configs[i]; + PQExpBuffer conninfo = createPQExpBuffer(); + + if (!cfg || !cfg->is_valid || cfg->init_replica != init_replica) + continue; + + printfPQExpBuffer(conninfo, "%s replication=database", cfg->dsn); + remote_conn = PQconnectdb(conninfo->data); + destroyPQExpBuffer(conninfo); + + if (PQstatus(remote_conn) != CONNECTION_OK) + { + die(_("Could not connect to the remote server: %s"), + PQerrorMessage(remote_conn)); + } + + ri = get_remote_info(remote_conn); + dboid = cfg->init_replica ? ri->dboid : get_dboid_from_dbname(local_conn, cfg->dbname); + + /* XXX: this might break if timeline switch happens in meantime */ + tlid = cfg->init_replica ? ri->tlid + 1 : ri->tlid; + + snprintf(system_identifier_s, sizeof(system_identifier_s), UINT64_FORMAT, system_identifier); + snprintf(NameStr(slot_name), NAMEDATALEN, BDR_SLOT_NAME_FORMAT, + ri->dboid, system_identifier_s, tlid, + dboid, ""); + NameStr(slot_name)[NAMEDATALEN - 1] = '\0'; + + create_replication_slot(remote_conn, &slot_name); + + PQfinish(remote_conn); + remote_conn = NULL; + + snprintf(remote_ident, sizeof(remote_ident), + BDR_NODE_ID_FORMAT, + ri->sysid, ri->tlid, ri->dboid, dboid, + ""); + } +} + +/* + * Read replication info about remote connection + */ +static RemoteInfo * +get_remote_info(PGconn *conn) +{ + RemoteInfo *ri = (RemoteInfo *)pg_malloc(sizeof(RemoteInfo)); + char *remote_tlid; + char *remote_dboid; + PGresult *res; + + res = PQexec(conn, "IDENTIFY_SYSTEM"); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + die(_("Could not send replication command \"%s\": %s"), + "IDENTIFY_SYSTEM", PQerrorMessage(conn)); + } + + if (PQntuples(res) != 1 || PQnfields(res) != 5) + { + PQclear(res); + die(_("Could not identify system: got %d rows and %d fields, expected %d rows and %d fields\n"), + PQntuples(res), PQnfields(res), 1, 5); + } + +#ifdef HAVE_STRTOULL + ri->sysid = strtoull(PQgetvalue(res, 0, 0), NULL, 10); +#else + ri->sysid = strtoul(PQgetvalue(res, 0, 0), NULL, 10); +#endif + + remote_tlid = PQgetvalue(res, 0, 1); + if (sscanf(remote_tlid, "%u", &ri->tlid) != 1) + die(_("Could not parse remote tlid %s"), remote_tlid); + + remote_dboid = PQgetvalue(res, 0, 4); + if (sscanf(remote_dboid, "%u", &ri->dboid) != 1) + die(_("Could not parse remote database OID %s"), remote_dboid); + + PQclear(res); + + return ri; +} + +/* + * Get dboid based on dbname + */ +static Oid +get_dboid_from_dbname(PGconn *conn, const char* dbname) +{ + char *dboid_str; + Oid dboid; + PQExpBuffer query = createPQExpBuffer(); + PGresult *res; + + appendPQExpBuffer(query, "SELECT oid FROM pg_catalog.pg_database WHERE datname = '%s'", + dbname); + + res = PQexec(conn, query->data); + if (PQresultStatus(res) != PGRES_TUPLES_OK || PQntuples(res) != 1) + { + PQclear(res); + die(_("Could not get database id for \"%s\": %s"), + dbname, PQerrorMessage(conn)); + } + + dboid_str = PQgetvalue(res, 0, 0); + if (sscanf(dboid_str, "%u", &dboid) != 1) + die(_("Could not parse database OID %s"), dboid_str); + + PQclear(res); + destroyPQExpBuffer(query); + + return dboid; +} + +/* + * Create replication slot + */ +static void +create_replication_slot(PGconn *conn, Name slot_name) +{ + PQExpBuffer query = createPQExpBuffer(); + PGresult *res; + + appendPQExpBuffer(query, "CREATE_REPLICATION_SLOT \"%s\" LOGICAL %s", + NameStr(*slot_name), "bdr_output"); + + res = PQexec(conn, query->data); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + die(_("Could not send replication command \"%s\": status %s: %s\n"), + query->data, + PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res)); + } + + PQclear(res); + destroyPQExpBuffer(query); +} + + +/* + * Initialize new remote identifiers to specific position. + */ +static void +initialize_replication_identifiers(char *remote_lsn) +{ + int i; + PGresult *res; + + /* Remove replication identifiers */ + res = PQexec(local_conn, "DELETE FROM pg_catalog.pg_replication_identifier;"); + if (PQresultStatus(res) != PGRES_COMMAND_OK) + { + PQclear(res); + die(_("Could not remove replication identifier: %s"), PQerrorMessage(local_conn)); + } + + /* Initialize new replication identifiers */ + for (i = 0; i < bdr_connection_config_count; i++) + { + char remote_ident[256]; + Oid dboid; + RemoteInfo *ri; + BdrConnectionConfig *cfg = bdr_connection_configs[i]; + PQExpBuffer conninfo = createPQExpBuffer(); + + if (!cfg || !cfg->is_valid) + continue; + + printfPQExpBuffer(conninfo, "%s replication=database", cfg->dsn); + remote_conn = PQconnectdb(conninfo->data); + destroyPQExpBuffer(conninfo); + + if (PQstatus(remote_conn) != CONNECTION_OK) + { + die(_("Could not connect to the remote server: %s"), + PQerrorMessage(remote_conn)); + } + + ri = get_remote_info(remote_conn); + dboid = cfg->init_replica ? ri->dboid : get_dboid_from_dbname(local_conn, cfg->dbname); + + PQfinish(remote_conn); + remote_conn = NULL; + + snprintf(remote_ident, sizeof(remote_ident), + BDR_NODE_ID_FORMAT, + ri->sysid, ri->tlid, ri->dboid, dboid, + ""); + + create_replication_identifier(local_conn, remote_ident, + cfg->init_replica ? remote_lsn : NULL); + } +} + +/* + * Create local replication identifier + */ +static void +create_replication_identifier(PGconn *conn, const char *remote_ident, char *remote_lsn) +{ + PQExpBuffer query = createPQExpBuffer(); + PGresult *res; + + printfPQExpBuffer(query, "SELECT pg_replication_identifier_create('%s')", + remote_ident); + + res = PQexec(conn, query->data); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + die(_("Could not create replication indentifier \"%s\": status %s: %s\n"), + query->data, + PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res)); + } + PQclear(res); + + if (remote_lsn) + { + printfPQExpBuffer(query, "SELECT pg_replication_identifier_advance('%s', '%s', '0/0')", + remote_ident, remote_lsn); + + res = PQexec(conn, query->data); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + die(_("Could not advance replication indentifier \"%s\": status %s: %s\n"), + query->data, + PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res)); + } + PQclear(res); + } + + destroyPQExpBuffer(query); +} + + +/* + * Create remote restore point which will be used to get into synchronized + * state through physical replay. + */ +static char * +create_restore_point(char *remote_connstr) +{ + PQExpBuffer query = createPQExpBuffer(); + PGresult *res; + char *remote_lsn = NULL; + + remote_conn = PQconnectdb(remote_connstr); + if (PQstatus(remote_conn) != CONNECTION_OK) + { + die(_("Could not connect to the remote server: %s"), + PQerrorMessage(remote_conn)); + } + + printfPQExpBuffer(query, "SELECT pg_create_restore_point('%s')", NameStr(restore_point_name)); + res = PQexec(remote_conn, query->data); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + die(_("Could not create restore point \"%s\": status %s: %s\n"), + query->data, + PQresStatus(PQresultStatus(res)), PQresultErrorMessage(res)); + } + remote_lsn = pstrdup(PQgetvalue(res, 0, 0)); + + PQclear(res); + PQfinish(remote_conn); + remote_conn = NULL; + destroyPQExpBuffer(query); + + return remote_lsn; +} + + +static char * +detect_remote_conninfo(void) +{ + int i; + + for (i = 0; i < bdr_connection_config_count; i++) + { + BdrConnectionConfig *cfg = bdr_connection_configs[i]; + + if (!cfg || !cfg->is_valid || !cfg->init_replica) + continue; + + return pg_strdup(cfg->dsn); + } + + return NULL; +} + +char * +get_conninfo(char *dbname, char *dbhost, char *dbport, char *dbuser) +{ + char *ret; + int argcount = 4; /* dbname, host, user, port, password */ + int i; + const char **keywords; + const char **values; + PQconninfoOption *conn_opts = NULL; + PQconninfoOption *conn_opt; + char *err_msg = NULL; + + /* + * Merge the connection info inputs given in form of connection string + * and options + */ + i = 0; + if (dbname) + { + conn_opts = PQconninfoParse(dbname, &err_msg); + if (conn_opts == NULL) + { + die(_("Invalid connection string: %s"), err_msg); + } + + for (conn_opt = conn_opts; conn_opt->keyword != NULL; conn_opt++) + { + if (conn_opt->val != NULL && conn_opt->val[0] != '\0') + argcount++; + } + + keywords = pg_malloc0((argcount + 1) * sizeof(*keywords)); + values = pg_malloc0((argcount + 1) * sizeof(*values)); + + for (conn_opt = conn_opts; conn_opt->keyword != NULL; conn_opt++) + { + if (conn_opt->val != NULL && conn_opt->val[0] != '\0') + { + keywords[i] = conn_opt->keyword; + values[i] = conn_opt->val; + i++; + } + } + } + else + { + keywords = pg_malloc0((argcount + 1) * sizeof(*keywords)); + values = pg_malloc0((argcount + 1) * sizeof(*values)); + } + + keywords[i] = "dbname"; + values[i] = dbname == NULL ? "postgres" : dbname; + i++; + + if (dbhost) + { + keywords[i] = "host"; + values[i] = dbhost; + i++; + } + if (dbuser) + { + keywords[i] = "user"; + values[i] = dbuser; + i++; + } + if (dbport) + { + keywords[i] = "port"; + values[i] = dbport; + i++; + } + + ret = PQconninfoParams_to_conninfo(keywords, values); + + /* Connection ok! */ + pg_free(values); + pg_free(keywords); + if (conn_opts) + PQconninfoFree(conn_opts); + + return ret; +} + + +/* + * Create a new unique installation identifier. + * + * See notes in xlog.c about the algorithm. + * + * XXX: how to reuse the code between xlog.c, pg_resetxlog.c and this file + */ +static uint64 +GenerateSystemIdentifier(void) +{ + uint64 sysidentifier; + struct timeval tv; + + gettimeofday(&tv, NULL); + sysidentifier = ((uint64) tv.tv_sec) << 32; + sysidentifier |= ((uint64) tv.tv_usec) << 12; + sysidentifier |= getpid() & 0xFFF; + + return sysidentifier; +} + +/* + * Write contents of recovery.conf + */ +static void +WriteRecoveryConf(PQExpBuffer contents) +{ + char filename[MAXPGPATH]; + FILE *cf; + + sprintf(filename, "%s/recovery.conf", data_dir); + + cf = fopen(filename, "w"); + if (cf == NULL) + { + die(_("%s: could not create file \"%s\": %s\n"), progname, filename, strerror(errno)); + } + + if (fwrite(contents->data, contents->len, 1, cf) != 1) + { + die(_("%s: could not write to file \"%s\": %s\n"), + progname, filename, strerror(errno)); + } + + fclose(cf); +} + +/* + * Convert PQconninfoOption array into conninfo string + */ +static char * +PQconninfoParams_to_conninfo(const char *const * keywords, const char *const * values) +{ + PQExpBuffer retbuf = createPQExpBuffer(); + char *ret; + int i = 0; + + while (keywords[i]) + { + char *tmpval = escapeConninfoValue(values[i]); + appendPQExpBuffer(retbuf, "%s = '%s' ", keywords[i], tmpval); + pg_free(tmpval); + i++; + } + + ret = pg_strdup(retbuf->data); + destroyPQExpBuffer(retbuf); + + return ret; +} + +/* + * Escape connection info value + */ +static char * +escapeConninfoValue(const char *val) +{ + int i, j; + char *ret = pg_malloc(strlen(val) * 2 + 1); + + j = 0; + for (i = 0; i < strlen(val); i++) + { + switch (val[i]) + { + case '\\': + case '\'': + ret[j++] = '\\'; + default: + break; + } + + ret[j++] = val[i]; + } + + ret[j] = '\0'; + + return ret; +} + + +/* + * Taken from adt/bool.c + * + * Try to interpret value as boolean value. Valid values are: true, + * false, yes, no, on, off, 1, 0; as well as unique prefixes thereof. + * If the string parses okay, return true, else false. + * If okay and result is not NULL, return the value in *result. + */ +static bool +parse_bool(const char *value, bool *result) +{ + return parse_bool_with_len(value, strlen(value), result); +} + +static bool +parse_bool_with_len(const char *value, size_t len, bool *result) +{ + switch (*value) + { + case 't': + case 'T': + if (pg_strncasecmp(value, "true", len) == 0) + { + if (result) + *result = true; + return true; + } + break; + case 'f': + case 'F': + if (pg_strncasecmp(value, "false", len) == 0) + { + if (result) + *result = false; + return true; + } + break; + case 'y': + case 'Y': + if (pg_strncasecmp(value, "yes", len) == 0) + { + if (result) + *result = true; + return true; + } + break; + case 'n': + case 'N': + if (pg_strncasecmp(value, "no", len) == 0) + { + if (result) + *result = false; + return true; + } + break; + case 'o': + case 'O': + /* 'o' is not unique enough */ + if (pg_strncasecmp(value, "on", (len > 2 ? len : 2)) == 0) + { + if (result) + *result = true; + return true; + } + else if (pg_strncasecmp(value, "off", (len > 2 ? len : 2)) == 0) + { + if (result) + *result = false; + return true; + } + break; + case '1': + if (len == 1) + { + if (result) + *result = true; + return true; + } + break; + case '0': + if (len == 1) + { + if (result) + *result = false; + return true; + } + break; + default: + break; + } + + if (result) + *result = false; /* suppress compiler warning */ + return false; +} + +/* + * Remove leading and trailing whitespace from the string, + * does not change input + */ +static char * +trimwhitespace(char *str) +{ + char *end; + char *res; + size_t len; + + while(isspace(*str)) + str++; + + if(*str == 0) + return NULL; + + end = str + strlen(str) - 1; + while(end > str && isspace(*end)) + end--; + + len = end-str; + if (!len) + return NULL; + + len++; + res = pg_malloc(len); + memcpy(res, str, len); + res[len] = '\0'; + + return res; +} + +/* + * Split guc list paramenter into array + * Note that this is not 100% compatible with that is in core + * but seems good enough for our purposes + */ +static char ** +split_list_guc(char *str, size_t *count) +{ + char **ret = NULL; + char *t = strtok (str, ","); + size_t i = 0; + + while (t) { + ret = realloc(ret, sizeof(char*)* ++i); + + if (ret == NULL) + die(_("Out of memory")); + + t = trimwhitespace(t); + if (!t) + die(_("Bad input for list: %s"), str); + + ret[i-1] = t; + + t = strtok(NULL, ","); + } + + *count = i; + return ret; +} + + +/* + * Find the pgport and try a connection + * + * Based on pg_ctl.c:test_postmaster_connection + */ +static void +wait_postmaster_connection(char **out_connstr) +{ + PGPing res; + long pm_pid = 0; + char connstr[MAXPGPATH * 2 + 256]; + + *out_connstr = NULL; + connstr[0] = '\0'; + + for (;;) + { + /* Do we need a connection string? */ + if (connstr[0] == '\0') + { + /*---------- + * The number of lines in postmaster.pid tells us several things: + * + * # of lines + * 0 lock file created but status not written + * 2 pre-9.1 server, shared memory not created + * 3 pre-9.1 server, shared memory created + * 5 9.1+ server, ports not opened + * 6 9.1+ server, shared memory not created + * 7 9.1+ server, shared memory created + * + * If we see less than 6 lines in postmaster.pid, just keep + * waiting. + *---------- + */ + char **optlines; + + /* Try to read the postmaster.pid file */ + if ((optlines = readfile(pid_file)) != NULL && + optlines[0] != NULL && + optlines[1] != NULL && + optlines[2] != NULL && + optlines[3] != NULL && + optlines[4] != NULL && + optlines[5] != NULL) + { + /* File is complete enough for us, parse it */ + long pmpid; + time_t pmstart; + + /* + * Make sanity checks. If it's for a standalone backend + * (negative PID), or the recorded start time is before + * pg_ctl started, then either we are looking at the wrong + * data directory, or this is a pre-existing pidfile that + * hasn't (yet?) been overwritten by our child postmaster. + * Allow 2 seconds slop for possible cross-process clock + * skew. + */ + pmpid = atol(optlines[LOCK_FILE_LINE_PID - 1]); + pmstart = atol(optlines[LOCK_FILE_LINE_START_TIME - 1]); + if (pmpid > 0 || pmstart > start_time - 3) + { + /* + * OK, seems to be a valid pidfile from our child. + */ + int portnum; + char *sockdir; + char *hostaddr; + char host_str[MAXPGPATH]; + + pm_pid = pmpid; + + /* + * Extract port number and host string to use. Prefer + * using Unix socket if available. + */ + portnum = atoi(optlines[LOCK_FILE_LINE_PORT - 1]); + sockdir = optlines[LOCK_FILE_LINE_SOCKET_DIR - 1]; + hostaddr = optlines[LOCK_FILE_LINE_LISTEN_ADDR - 1]; + + /* + * While unix_socket_directories can accept relative + * directories, libpq's host parameter must have a + * leading slash to indicate a socket directory. So, + * ignore sockdir if it's relative, and try to use TCP + * instead. + */ + if (sockdir[0] == '/') + strlcpy(host_str, sockdir, sizeof(host_str)); + else + strlcpy(host_str, hostaddr, sizeof(host_str)); + + /* remove trailing newline */ + if (strchr(host_str, '\n') != NULL) + *strchr(host_str, '\n') = '\0'; + + /* Fail if couldn't get either sockdir or host addr */ + if (host_str[0] == '\0') + { + fprintf(stderr, _("Relative socket directory is not supported\n")); + return; + } + + /* If postmaster is listening on "*", use localhost */ + if (strcmp(host_str, "*") == 0) + strcpy(host_str, "localhost"); + + /* + * We need to set connect_timeout otherwise on Windows + * the Service Control Manager (SCM) will probably + * timeout first. + */ + snprintf(connstr, sizeof(connstr), + "dbname=postgres port=%d host='%s' connect_timeout=5", + portnum, host_str); + } + } + + /* + * Free the results of readfile. + * + * This is safe to call even if optlines is NULL. + */ + free_readfile(optlines); + } + + /* If we have a connection string, ping the server */ + if (connstr[0] != '\0') + { + res = PQping(connstr); + if (res == PQPING_OK) + { + *out_connstr = connstr; + break; + } + else if (res == PQPING_NO_ATTEMPT) + break; + } + + /* + * If we've been able to identify the child postmaster's PID, check + * the process is still alive. This covers cases where the postmaster + * successfully created the pidfile but then crashed without removing + * it. + */ + if (pm_pid > 0 && !postmaster_is_alive((pid_t) pm_pid)) + return; + + /* No response, or startup still in process; wait */ + pg_usleep(1000000); /* 1 sec */ + print_msg("."); + } +} + +/* + * Wait for postmaster to die + */ +static void +wait_postgres_shutdown(void) +{ + long pid; + + for (;;) + { + if ((pid = get_pgpid()) != 0) + { + pg_usleep(1000000); /* 1 sec */ + print_msg("."); + } + else + break; + } +} + +static bool +is_pg_dir(char *path) +{ + struct stat statbuf; + char version_file[MAXPGPATH]; + + if (stat(path, &statbuf) != 0) + return false; + + snprintf(version_file, MAXPGPATH, "%s/PG_VERSION", data_dir); + if (stat(version_file, &statbuf) != 0 && errno == ENOENT) + { + return false; + } + + return true; +} + +/* + * Utility functions taken from pg_ctl + */ + +static char * +find_other_exec_or_die(const char *argv0, const char *target, const char *versionstr) +{ + int ret; + char *found_path; + + found_path = pg_malloc(MAXPGPATH); + + if ((ret = find_other_exec(argv0, target, versionstr, found_path)) < 0) + { + char full_path[MAXPGPATH]; + + if (find_my_exec(argv0, full_path) < 0) + strlcpy(full_path, progname, sizeof(full_path)); + + if (ret == -1) + die(_("The program \"%s\" is needed by %s " + "but was not found in the\n" + "same directory as \"%s\".\n" + "Check your installation.\n"), + target, progname, full_path); + else + die(_("The program \"%s\" was found by \"%s\"\n" + "but was not the same version as %s.\n" + "Check your installation.\n"), + target, full_path, progname); + } + + return found_path; +} + +static bool +postmaster_is_alive(pid_t pid) +{ + /* + * Test to see if the process is still there. Note that we do not + * consider an EPERM failure to mean that the process is still there; + * EPERM must mean that the given PID belongs to some other userid, and + * considering the permissions on $PGDATA, that means it's not the + * postmaster we are after. + * + * Don't believe that our own PID or parent shell's PID is the postmaster, + * either. (Windows hasn't got getppid(), though.) + */ + if (pid == getpid()) + return false; +#ifndef WIN32 + if (pid == getppid()) + return false; +#endif + if (kill(pid, 0) == 0) + return true; + return false; +} + +static long +get_pgpid(void) +{ + FILE *pidf; + long pid; + + pidf = fopen(pid_file, "r"); + if (pidf == NULL) + { + return 0; + } + if (fscanf(pidf, "%ld", &pid) != 1) + { + return 0; + } + fclose(pidf); + return pid; +} + +/* + * get the lines from a text file - return NULL if file can't be opened + */ +static char ** +readfile(const char *path) +{ + int fd; + int nlines; + char **result; + char *buffer; + char *linebegin; + int i; + int n; + int len; + struct stat statbuf; + + /* + * Slurp the file into memory. + * + * The file can change concurrently, so we read the whole file into memory + * with a single read() call. That's not guaranteed to get an atomic + * snapshot, but in practice, for a small file, it's close enough for the + * current use. + */ + fd = open(path, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + return NULL; + if (fstat(fd, &statbuf) < 0) + { + close(fd); + return NULL; + } + if (statbuf.st_size == 0) + { + /* empty file */ + close(fd); + result = (char **) pg_malloc(sizeof(char *)); + *result = NULL; + return result; + } + buffer = pg_malloc(statbuf.st_size + 1); + + len = read(fd, buffer, statbuf.st_size + 1); + close(fd); + if (len != statbuf.st_size) + { + /* oops, the file size changed between fstat and read */ + free(buffer); + return NULL; + } + + /* + * Count newlines. We expect there to be a newline after each full line, + * including one at the end of file. If there isn't a newline at the end, + * any characters after the last newline will be ignored. + */ + nlines = 0; + for (i = 0; i < len; i++) + { + if (buffer[i] == '\n') + nlines++; + } + + /* set up the result buffer */ + result = (char **) pg_malloc((nlines + 1) * sizeof(char *)); + + /* now split the buffer into lines */ + linebegin = buffer; + n = 0; + for (i = 0; i < len; i++) + { + if (buffer[i] == '\n') + { + int slen = &buffer[i] - linebegin + 1; + char *linebuf = pg_malloc(slen + 1); + + memcpy(linebuf, linebegin, slen); + linebuf[slen] = '\0'; + result[n++] = linebuf; + linebegin = &buffer[i + 1]; + } + } + result[n] = NULL; + + free(buffer); + + return result; +} + +/* + * Free memory allocated for optlines through readfile() + */ +void +free_readfile(char **optlines) +{ + char *curr_line = NULL; + int i = 0; + + if (!optlines) + return; + + while ((curr_line = optlines[i++])) + free(curr_line); + + free(optlines); + + return; +} diff --git a/contrib/bdr/bdr_init_copy.mk b/contrib/bdr/bdr_init_copy.mk new file mode 100644 index 0000000000..da12e74b4e --- /dev/null +++ b/contrib/bdr/bdr_init_copy.mk @@ -0,0 +1,24 @@ +subdir = contrib/bdr +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global + +OBJS= bdr_init_copy.o +override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) + +all: bdr_init_copy + +bdr_init_copy: $(OBJS) | submake-libpq submake-libpgport + $(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(libpq_pgport) $(LIBS) -o $@$(X) + +install: all installdirs + $(INSTALL_PROGRAM) bdr_init_copy$(X) '$(DESTDIR)$(bindir)/bdr_init_copy$(X)' + +installdirs: + $(MKDIR_P) '$(DESTDIR)$(bindir)' + +uninstall: + rm -f '$(DESTDIR)$(bindir)/bdr_init_copy$(X)' + +clean distclean maintainer-clean: + rm -f bdr_init_copy$(X) $(OBJS) + diff --git a/contrib/bdr/bdr_init_replica.c b/contrib/bdr/bdr_init_replica.c index 1d6d1e7e49..0515cda007 100644 --- a/contrib/bdr/bdr_init_replica.c +++ b/contrib/bdr/bdr_init_replica.c @@ -54,6 +54,7 @@ #include "utils/syscache.h" char *bdr_temp_dump_directory = NULL; +bool bdr_init_from_basedump = false; static void bdr_exec_init_replica(BdrConnectionConfig *cfg, char *snapshot); @@ -822,61 +823,69 @@ bdr_init_replica(Name dbname) /* Get the bdr.bdr_nodes status field for our node id from the remote */ status = bdr_get_remote_status(nonrepl_init_conn); - switch (status) + + if (bdr_init_from_basedump) { - case '\0': - elog(DEBUG2, "bdr %s: initializing from clean state", - NameStr(*dbname)); - break; + status = bdr_set_remote_status(nonrepl_init_conn, 'c', status); + } + else + { + switch (status) + { + case '\0': + elog(DEBUG2, "bdr %s: initializing from clean state", + NameStr(*dbname)); + break; - case 'r': - /* - * Init has been completed, but we didn't check our local - * bdr.bdr_nodes, or the final update hasn't propagated yet. - * - * All we need to do is catch up, we already replayed enough to be - * consistent and start up in normal mode last time around - */ - elog(DEBUG2, "bdr %s: init already completed, nothing to do", - NameStr(*dbname)); - return; + case 'r': + /* + * Init has been completed, but we didn't check our local + * bdr.bdr_nodes, or the final update hasn't propagated yet. + * + * All we need to do is catch up, we already replayed enough to be + * consistent and start up in normal mode last time around + */ + elog(DEBUG2, "bdr %s: init already completed, nothing to do", + NameStr(*dbname)); + return; - case 'c': - /* - * We were in catchup mode when we died. We need to resume catchup - * mode up to the expected LSN before switching over. - * - * To do that all we need to do is fall through without doing any - * slot re-creation, dump/apply, etc, and pick up when we do - * catchup. - * - * We won't know what the original catchup target point is, but we - * can just catch up to whatever xlog position the server is - * currently at. - */ - elog(DEBUG2, "bdr %s: dump applied, need to continue catchup", - NameStr(*dbname)); - break; + case 'c': + /* + * We were in catchup mode when we died. We need to resume catchup + * mode up to the expected LSN before switching over. + * + * To do that all we need to do is fall through without doing any + * slot re-creation, dump/apply, etc, and pick up when we do + * catchup. + * + * We won't know what the original catchup target point is, but we + * can just catch up to whatever xlog position the server is + * currently at. + */ + elog(DEBUG2, "bdr %s: dump applied, need to continue catchup", + NameStr(*dbname)); + break; - case 'i': - /* - * A previous init attempt seems to have failed. Clean up, then - * fall through to start setup again. - * - * We can't just re-use the slot and replication identifier that - * were created last time (if they were), because we have no way - * of getting the slot's exported snapshot after - * CREATE_REPLICATION_SLOT. - */ - elog(DEBUG2, "bdr %s: previous failed initalization detected, cleaning up", - NameStr(*dbname)); - bdr_drop_slot_and_replication_identifier(init_replica_config); - status = bdr_set_remote_status(nonrepl_init_conn, '\0', status); - break; - - default: - elog(ERROR, "unreachable"); /* Unhandled case */ - break; + case 'i': + /* + * A previous init attempt seems to have failed. Clean up, then + * fall through to start setup again. + * + * We can't just re-use the slot and replication identifier that + * were created last time (if they were), because we have no way + * of getting the slot's exported snapshot after + * CREATE_REPLICATION_SLOT. + */ + elog(DEBUG2, "bdr %s: previous failed initalization detected, cleaning up", + NameStr(*dbname)); + bdr_drop_slot_and_replication_identifier(init_replica_config); + status = bdr_set_remote_status(nonrepl_init_conn, '\0', status); + break; + + default: + elog(ERROR, "unreachable"); /* Unhandled case */ + break; + } } if (status == '\0') diff --git a/contrib/bdr/bdr_internal.h b/contrib/bdr/bdr_internal.h new file mode 100644 index 0000000000..0d70cea023 --- /dev/null +++ b/contrib/bdr/bdr_internal.h @@ -0,0 +1,33 @@ +/* + * bdr.h + * + * BiDirectionalReplication + * + * Copyright (c) 2012-2013, PostgreSQL Global Development Group + * + * contrib/bdr/bdr.h + */ +#ifndef BDR_INTERNAL_H +#define BDR_INTERNAL_H + +#define BDR_SLOT_NAME_FORMAT "bdr_%u_%s_%u_%u__%s" +#define BDR_NODE_ID_FORMAT "bdr_"UINT64_FORMAT"_%u_%u_%u_%s" + +/* GUC storage for a configured BDR connection. */ +typedef struct BdrConnectionConfig +{ + char *dsn; + int apply_delay; + bool init_replica; + char *replica_local_dsn; + /* + * These aren't technically GUCs, but are per-connection config + * information obtained from the GUCs. + */ + char *name; + char *dbname; + /* Connection config might be broken (blank dsn, etc) */ + bool is_valid; +} BdrConnectionConfig; + +#endif /* BDR_INTERNAL_H */ diff --git a/src/backend/replication/logical/replication_identifier.c b/src/backend/replication/logical/replication_identifier.c index ebc9c06139..60661a9e24 100644 --- a/src/backend/replication/logical/replication_identifier.c +++ b/src/backend/replication/logical/replication_identifier.c @@ -450,6 +450,52 @@ pg_get_replication_identifier_progress(PG_FUNCTION_ARGS) return (Datum) 0; } +Datum +pg_replication_identifier_advance(PG_FUNCTION_ARGS) +{ + text *name = PG_GETARG_TEXT_P(0); + text *remote_lsn = PG_GETARG_TEXT_P(1); + text *local_lsn = PG_GETARG_TEXT_P(2); + char *remote_lsn_str; + char *local_lsn_str; + XLogRecPtr remote_commit; + XLogRecPtr local_commit; + RepNodeId node; + uint32 rhi, + lhi, + rlo, + llo; + + CheckReplicationIdentifierPrerequisites(true); + + node = GetReplicationIdentifier(text_to_cstring(name), false); + + remote_lsn_str = text_to_cstring(remote_lsn); + local_lsn_str = text_to_cstring(local_lsn); + + if (sscanf(remote_lsn_str, "%X/%X", &rhi, &rlo) != 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse remote_lsn \"%s\"", + remote_lsn_str))); + + if (sscanf(local_lsn_str, "%X/%X", &lhi, &llo) != 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse local_lsn \"%s\"", + local_lsn_str))); + + remote_commit = ((uint64) (rhi)) << 32 | (uint64) rlo; + local_commit = ((uint64) (lhi)) << 32 | (uint64) llo; + + AdvanceReplicationIdentifier(node, remote_commit, local_commit); + + pfree(remote_lsn_str); + pfree(local_lsn_str); + + PG_RETURN_VOID(); +} + Size ReplicationIdentifierShmemSize(void) { diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index ef1443bb5e..871dae7089 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -68,6 +68,7 @@ static MultiXactOffset set_mxoff = (MultiXactOffset) -1; static uint32 minXlogTli = 0; static XLogSegNo minXlogSegNo = 0; static uint64 set_sysid = 0; +static XLogRecPtr oldCheckPoint = 0; static uint64 GenerateSystemIdentifier(void); static bool ReadControlFile(void); @@ -79,6 +80,7 @@ static void FindEndOfXLOG(void); static void KillExistingXLOG(void); static void KillExistingArchiveStatus(void); static void WriteEmptyXLOG(void); +static void UpdateLogicalCheckpoints(void); static void usage(void); int @@ -374,6 +376,9 @@ main(int argc, char *argv[]) if (set_sysid != 0) ControlFile.system_identifier = set_sysid; + /* Keep the old checkpoint for later use */ + oldCheckPoint = ControlFile.checkPoint; + /* * If we had to guess anything, and -f was not given, just print the * guessed values and exit. Also print if -n is given. @@ -408,6 +413,7 @@ main(int argc, char *argv[]) KillExistingXLOG(); KillExistingArchiveStatus(); WriteEmptyXLOG(); + UpdateLogicalCheckpoints(); printf(_("Transaction log reset\n")); return 0; @@ -1106,6 +1112,64 @@ WriteEmptyXLOG(void) close(fd); } +/* + * Copy the last logical checkpoint to new name + */ +static void +UpdateLogicalCheckpoints(void) +{ + int sourcefd; + int targetfd; + char sourcepath[MAXPGPATH]; + char targetpath[MAXPGPATH]; + char buffer[8192]; + size_t len; + + /* Checkpoint didn't change (it was guessed?), return */ + if (oldCheckPoint == ControlFile.checkPoint) + return; + + sprintf(sourcepath, "pg_llog/checkpoints/%X-%X.ckpt", + (uint32)(oldCheckPoint >> 32), (uint32)oldCheckPoint); + + sprintf(targetpath, "pg_llog/checkpoints/%X-%X.ckpt", + (uint32)(ControlFile.checkPoint >> 32), (uint32)ControlFile.checkPoint); + + /* If there is no checkpoint file, we just silently return */ + if ((sourcefd = open(sourcepath, O_RDONLY | PG_BINARY, 0)) < 0) + return; + + if ((targetfd = open(targetpath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + S_IRUSR | S_IWUSR)) < 0) + { + fprintf(stderr, _("%s: could not open file \"%s\": %s\n"), + progname, targetpath, strerror(errno)); + exit(1); + } + + while ((len = read(sourcefd, buffer, sizeof(buffer))) > 0) + { + errno == 0; + if (write(targetfd, buffer, len) != len) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + fprintf(stderr, _("%s: could not write file \"%s\": %s\n"), + progname, targetpath, strerror(errno)); + exit(1); + } + } + + close(sourcefd); + + if (fsync(targetfd) != 0) + { + fprintf(stderr, _("%s: fsync error: %s\n"), progname, strerror(errno)); + exit(1); + } + close(targetfd); +} static void usage(void) diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index f305bc3574..45df5a8b50 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5085,6 +5085,9 @@ DESCR("replication identifier progress"); DATA(insert OID = 6009 ( pg_replication_identifier_is_replaying PGNSP PGUID 12 1 0 0 0 f f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_replication_identifier_is_replaying _null_ _null_ _null_ )); DESCR("is a replication identifier setup"); +DATA(insert OID = 6010 ( pg_replication_identifier_advance PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 2278 "25 25 25" _null_ _null_ _null_ _null_ pg_replication_identifier_advance _null_ _null_ _null_ )); +DESCR("advance replication itentifier to specific location"); + /* * Symbolic values for provolatile column: these indicate whether the result diff --git a/src/include/replication/replication_identifier.h b/src/include/replication/replication_identifier.h index 3dfe5935cc..bd07abb89a 100644 --- a/src/include/replication/replication_identifier.h +++ b/src/include/replication/replication_identifier.h @@ -42,5 +42,6 @@ extern Datum pg_replication_identifier_reset_replaying_from(PG_FUNCTION_ARGS); extern Datum pg_replication_identifier_is_replaying(PG_FUNCTION_ARGS); extern Datum pg_replication_identifier_setup_tx_origin(PG_FUNCTION_ARGS); extern Datum pg_get_replication_identifier_progress(PG_FUNCTION_ARGS); +extern Datum pg_replication_identifier_advance(PG_FUNCTION_ARGS); #endif