From: Jeff Layton <jlayton(a)redhat.com>
Add a new command-line tool for manipulating and querying the rados_grace
database. It will dump out the state of the epochs and then a list of
omap keys and their flags.
Change-Id: If57591265ce736cdcebab749651d5ab6982341d8
Signed-off-by: Jeff Layton <jlayton(a)redhat.com>
---
src/doc/man/CMakeLists.txt | 5 +
src/doc/man/rados_grace_tool.rst | 108 ++++++++++++++++
src/nfs-ganesha.spec-in.cmake | 3 +
src/tools/CMakeLists.txt | 4 +
src/tools/rados_grace_tool.c | 203 +++++++++++++++++++++++++++++++
5 files changed, 323 insertions(+)
create mode 100644 src/doc/man/rados_grace_tool.rst
create mode 100644 src/tools/rados_grace_tool.c
diff --git a/src/doc/man/CMakeLists.txt b/src/doc/man/CMakeLists.txt
index 4eb42b292c1d..3f28590e00fa 100644
--- a/src/doc/man/CMakeLists.txt
+++ b/src/doc/man/CMakeLists.txt
@@ -52,6 +52,11 @@ if(USE_FSAL_GPFS)
ganesha-gpfs-config.rst)
endif()
+# Add the rados_grace_tool man page to the list of pages to generate, but
+# only when USE_RADOS_RECOV is enabled.
+if(USE_RADOS_RECOV)
+  list(APPEND man_srcs rados_grace_tool.rst)
+endif()
+
foreach(man ${man_srcs})
list(APPEND sphinx_input ${CMAKE_CURRENT_SOURCE_DIR}/${man})
string(REGEX REPLACE ".rst$" "" cmd ${man})
diff --git a/src/doc/man/rados_grace_tool.rst b/src/doc/man/rados_grace_tool.rst
new file mode 100644
index 000000000000..c4b569fd0342
--- /dev/null
+++ b/src/doc/man/rados_grace_tool.rst
@@ -0,0 +1,108 @@
+===================================================================
+rados_grace_tool -- manipulate the shared grace management database
+===================================================================
+
+SYNOPSIS
+===================================================================
+
+| rados_grace_tool [ --pool pool_id ] [ --name obj_id ] dump|start|join|lift|remove|enforce|noenforce [ hostname ... ]
+
+DESCRIPTION
+===================================================================
+
+This tool allows the administrator to directly manipulate the database
+used by the rados_cluster recovery backend. Cluster nodes use that database to
+indicate their current state in order to coordinate a cluster-wide grace
+period.
+
+The first argument should be a command to execute against the database.
+If no command is given, **dump** is assumed. Any remaining arguments
+represent the hostnames of nodes in the cluster that should be acted upon.
+
+OPTIONS
+===================================================================
+**--pool**
+
+Set the name of the RADOS pool in which the grace database object resides.
+
+**--name**
+
+Set the name of the grace database RADOS object.
+
+COMMANDS
+===================================================================
+
+**dump**
+
+Dump the current status of the grace period database to stdout. This
+will show the current and recovery epoch serial numbers, as well as a
+list of hosts currently in the cluster and what flags they have set
+in their individual records.
+
+**start**
+
+Start a new grace period. This will begin a new grace period in the
+cluster if one is not already active and set the record for the listed
+cluster hosts as both needing a grace period and enforcing the grace
+period. If a grace period is already active, then this is equivalent
+to **join**.
+
+**join**
+
+Attempt to join an existing grace period. This works like **start**, but
+only if there is already an existing grace period in force.
+
+**lift**
+
+Attempt to lift the current grace period. This will clear the need grace
+flags for the listed hosts. If there are no more hosts in the cluster
+that require a grace period, then it will be fully lifted and the cluster
+will transition to normal operations.
+
+**remove**
+
+Remove one or more existing hosts from the cluster. This will remove the
+listed hosts from the grace database, possibly lifting the current grace
+period if there are no more hosts that need one.
+
+**enforce**
+
+Set the flag for the given hosts that indicates that they are currently
+enforcing the grace period, i.e. not allowing clients to acquire new
+state.
+
+**noenforce**
+
+Clear the enforcing flag for the given hosts, meaning that those hosts
+are now allowing clients to acquire new state.
+
+STARTING A NEW CLUSTER FROM A DOWN STATE
+===================================================================
+Because cluster nodes will attempt to lift the grace period as soon as
+no one needs it, it's best to start the grace period for the initial set
+of hosts before bringing up any of the nodes. This ensures that the grace
+period won't be lifted before all of the hosts have joined the cluster.
+
+Assuming that the nodes in our cluster are called host1 through host3:
+
+ **rados_grace_tool start host1 host2 host3**
+
+That will begin a new cluster-wide grace period, and add/update records for
+all three hosts to indicate that they need the grace period and are
+currently enforcing. With those records in place, the grace period can't
+be lifted until they have all ended their local recovery periods.
+
+After this point, new nodes can then join the cluster as needed. Cluster
+nodes can more or less do so on their own so no special steps should be
+needed to bring new nodes into an already-running cluster.
+
+REMOVING A NODE FROM THE CLUSTER
+===================================================================
+To remove a node from the cluster, take it down and then execute the remove
+command with the hosts to be removed:
+
+ **rados_grace_tool remove host2 host3**
+
+This will remove the node's record from the database, and possibly lift the
+current grace period if the listed hosts were the last ones to need it.
diff --git a/src/nfs-ganesha.spec-in.cmake b/src/nfs-ganesha.spec-in.cmake
index 351aeb2d045c..dcbbb9ae27c6 100644
--- a/src/nfs-ganesha.spec-in.cmake
+++ b/src/nfs-ganesha.spec-in.cmake
@@ -512,6 +512,7 @@ install -m 644 config_samples/xfs.conf
%{buildroot}%{_sysconfdir}/ganesha
%if %{with ceph}
install -m 644 config_samples/ceph.conf %{buildroot}%{_sysconfdir}/ganesha
+install -m 755 tools/rados_grace_tool %{buildroot}%{_sbindir}/rados_grace_tool
%endif
%if %{with rgw}
@@ -694,8 +695,10 @@ exit 0
%defattr(-,root,root,-)
%{_libdir}/ganesha/libfsalceph*
%config(noreplace) %{_sysconfdir}/ganesha/ceph.conf
+%{_sbindir}/rados_grace_tool
%if %{with man_page}
%{_mandir}/*/ganesha-ceph-config.8.gz
+%{_mandir}/*/rados_grace_tool.8.gz
%endif
%endif
diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt
index 7b536193568a..3856571f3ba1 100644
--- a/src/tools/CMakeLists.txt
+++ b/src/tools/CMakeLists.txt
@@ -1,3 +1,7 @@
+# Build the rados_grace_tool administration utility when USE_RADOS_RECOV
+# is enabled; it links against the rados_grace library and librados.
+if(USE_RADOS_RECOV)
+  add_executable(rados_grace_tool rados_grace_tool.c)
+  target_link_libraries(rados_grace_tool
+    rados_grace
+    ${RADOS_LIBRARIES})
+endif()
########### install files ###############
diff --git a/src/tools/rados_grace_tool.c b/src/tools/rados_grace_tool.c
new file mode 100644
index 000000000000..139d916b70ad
--- /dev/null
+++ b/src/tools/rados_grace_tool.c
@@ -0,0 +1,203 @@
+/*
+ * vim:noexpandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright 2017 Red Hat, Inc. and/or its affiliates.
+ * Author: Jeff Layton <jlayton(a)redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * rados_grace_tool: tool for managing the coordinated grace period database
+ *
+ * This tool allows an administrator to make direct changes to the rados_grace
+ * database. See the rados_grace support library sources for more info about
+ * the internals.
+ */
+#include "config.h"
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <endian.h>
+#include <getopt.h>
+#include <rados/librados.h>
+#include <rados_grace.h>
+
+/**
+ * Connect to the RADOS cluster and open an I/O context on a pool.
+ *
+ * A cluster handle is created, the default configuration file is read
+ * (NULL path), and the connection is established. The pool is created if
+ * it does not exist yet; an -EEXIST result from that step is not an error.
+ *
+ * @param io_ctx	[out] receives the I/O context for @pool on success
+ * @param pool		name of the pool to open (and create if missing)
+ *
+ * @return 0 on success, or the negative error code returned by the
+ *	   librados call that failed. A diagnostic naming that call is
+ *	   printed to stderr.
+ */
+static int
+cluster_connect(rados_ioctx_t *io_ctx, const char *pool)
+{
+	int ret;
+	rados_t clnt;
+
+	ret = rados_create(&clnt, NULL);
+	if (ret < 0) {
+		/* No handle was created, so there is nothing to release. */
+		fprintf(stderr, "rados_create: %d\n", ret);
+		return ret;
+	}
+
+	ret = rados_conf_read_file(clnt, NULL);
+	if (ret < 0) {
+		fprintf(stderr, "rados_conf_read_file: %d\n", ret);
+		goto out_shutdown;
+	}
+
+	ret = rados_connect(clnt);
+	if (ret < 0) {
+		fprintf(stderr, "rados_connect: %d\n", ret);
+		goto out_shutdown;
+	}
+
+	ret = rados_pool_create(clnt, pool);
+	if (ret < 0 && ret != -EEXIST) {
+		fprintf(stderr, "rados_pool_create: %d\n", ret);
+		goto out_shutdown;
+	}
+
+	ret = rados_ioctx_create(clnt, pool, io_ctx);
+	if (ret < 0) {
+		fprintf(stderr, "rados_ioctx_create: %d\n", ret);
+		goto out_shutdown;
+	}
+
+	/*
+	 * On success the cluster handle is deliberately left open: the
+	 * caller keeps using the I/O context created from it.
+	 */
+	return 0;
+
+out_shutdown:
+	/* Release the cluster handle so that error paths do not leak it. */
+	rados_shutdown(clnt);
+	return ret;
+}
+
+/*
+ * Long command line options. Both take an argument and are reported to
+ * the option loop as their short-option character.
+ */
+static const struct option long_options[] = {
+	{
+		.name = "name",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'n',
+	},
+	{
+		.name = "pool",
+		.has_arg = required_argument,
+		.flag = NULL,
+		.val = 'p',
+	},
+	/* all-zero terminator entry */
+	{ .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 },
+};
+
+/**
+ * Print the command line synopsis to stderr.
+ *
+ * @param argv	program argument vector; argv[0] is shown as the program name
+ */
+static void usage(char * const *argv)
+{
+	/*
+	 * The message is one string; it is written as adjacent literals
+	 * (which the compiler concatenates) so that no literal spans a
+	 * physical line break.
+	 */
+	fprintf(stderr,
+		"Usage:\n%s [ --pool pool_id ] [ --name obj_id ] "
+		"dump|start|join|lift|remove|enforce|noenforce "
+		"[ nodeid ... ]\n",
+		argv[0]);
+}
+
+int main(int argc, char * const *argv)
+{
+ int ret, nodes = 0;
+ rados_ioctx_t io_ctx;
+ const char *cmd = "dump";
+ uint64_t cur, rec;
+ char *pool = DEFAULT_RADOS_GRACE_POOL;
+ char *name = DEFAULT_RADOS_GRACE_OID;
+ char c;
+ const char * const *nodeids;
+
+ while ((c = getopt_long(argc, argv, "n:p:", long_options,
+ NULL)) != -1) {
+ switch (c) {
+ case 'n':
+ name = optarg;
+ break;
+ case 'p':
+ pool = optarg;
+ break;
+ default:
+ usage(argv);
+ return 1;
+ }
+ }
+
+ if (argc > optind) {
+ cmd = argv[optind];
+ ++optind;
+ nodes = argc - optind;
+ nodeids = (const char * const *)&argv[optind];
+ }
+
+ ret = cluster_connect(&io_ctx, pool);
+ if (ret) {
+ fprintf(stderr, "Can't connect to cluster: %d\n", ret);
+ return 1;
+ }
+
+ ret = rados_grace_create(io_ctx, name);
+ if (ret < 0 && ret != -EEXIST) {
+ fprintf(stderr, "Can't create grace db: %d\n", ret);
+ return 1;
+ }
+
+ if (!strcmp(cmd, "dump")) {
+ ret = rados_grace_dump(io_ctx, name);
+ } else if (!strcmp(cmd, "start")) {
+ if (!nodes) {
+ fprintf(stderr, "Need at least one nodeid.\n");
+ ret = -EINVAL;
+ } else {
+ ret = rados_grace_join_bulk(io_ctx, name,
+ nodes, nodeids, &cur,
+ &rec, true);
+ }
+ } else if (!strcmp(cmd, "join")) {
+ uint64_t cur, rec;
+
+ if (!nodes) {
+ fprintf(stderr, "Need at least one nodeid.\n");
+ ret = -EINVAL;
+ } else {
+ ret = rados_grace_join_bulk(io_ctx, name,
+ nodes, nodeids, &cur,
+ &rec, false);
+ }
+ } else if (!strcmp(cmd, "lift")) {
+ if (!nodes) {
+ fprintf(stderr, "Need at least one nodeid.\n");
+ ret = -EINVAL;
+ } else {
+ ret = rados_grace_lift_bulk(io_ctx, name,
+ nodes, nodeids, &cur,
+ &rec, false);
+ }
+ } else if (!strcmp(cmd, "remove")) {
+ if (!nodes) {
+ fprintf(stderr, "Need at least one nodeid.\n");
+ ret = -EINVAL;
+ } else {
+ ret = rados_grace_lift_bulk(io_ctx, name,
+ nodes, nodeids, &cur,
+ &rec, true);
+ }
+ } else if (!strcmp(cmd, "enforce")) {
+ if (!nodes) {
+ fprintf(stderr, "Need at least one nodeid.\n");
+ ret = -EINVAL;
+ } else {
+ ret = rados_grace_enforcing_toggle(io_ctx,
+ name, nodes, nodeids,
+ &cur, &rec, true);
+ }
+ } else if (!strcmp(cmd, "noenforce")) {
+ if (!nodes) {
+ fprintf(stderr, "Need at least one nodeid.\n");
+ ret = -EINVAL;
+ } else {
+ ret = rados_grace_enforcing_toggle(io_ctx,
+ name, nodes, nodeids,
+ &cur, &rec, false);
+ }
+ } else {
+ usage(argv);
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ return 1;
+ return 0;
+}
--
2.17.0