From 32d650f9dfd64d78bd32a7d1fd231044ff1d893e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Apr 2021 14:01:24 +0930 Subject: [PATCH] lightningd: don't abort on incorrect versions, but try to re-exec. You still shouldn't do this (you could get some transient failures), but at least you have a decent chance if you reinstall over a running daemon, instead of getting confusing internal errors if message formats have changed. Changelog-Added: lightningd: we now try to restart if subdaemons are upgraded underneath us. Signed-off-by: Rusty Russell Fixes: #4346 --- contrib/pyln-testing/pyln/testing/utils.py | 3 +- lightningd/lightningd.c | 34 ++++++++++++++++++++++ lightningd/lightningd.h | 3 ++ lightningd/subd.c | 10 +++++-- tests/plugins/badopeningd.sh | 11 +++++++ tests/test_misc.py | 30 +++++++++++++++++++ 6 files changed, 87 insertions(+), 4 deletions(-) create mode 100755 tests/plugins/badopeningd.sh diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py index dff897eb6..29db7af2e 100644 --- a/contrib/pyln-testing/pyln/testing/utils.py +++ b/contrib/pyln-testing/pyln/testing/utils.py @@ -1222,7 +1222,8 @@ class NodeFactory(object): 'random_hsm', 'feerates', 'wait_for_bitcoind_sync', - 'allow_bad_gossip' + 'allow_bad_gossip', + 'start', ] node_opts = {k: v for k, v in opts.items() if k in node_opt_keys} cli_opts = {k: v for k, v in opts.items() if k not in node_opt_keys} diff --git a/lightningd/lightningd.c b/lightningd/lightningd.c index edf4281c2..4ae921963 100644 --- a/lightningd/lightningd.c +++ b/lightningd/lightningd.c @@ -211,6 +211,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx) ld->listen = true; ld->autolisten = true; ld->reconnect = true; + ld->try_reexec = false; /*~ This is from ccan/timer: it is efficient for the case where timers * are deleted before expiry (as is common with timeouts) using an @@ -849,6 +850,8 @@ int main(int argc, char *argv[]) struct rlimit nofile = {1024, 1024}; int sigchld_rfd; int exit_code = 0; + char **orig_argv; + bool try_reexec; /*~ Make sure that we limit ourselves to something reasonable. Modesty * is a virtue. */ @@ -883,6 +886,17 @@ int main(int argc, char *argv[]) ld = new_lightningd(NULL); ld->state = LD_STATE_RUNNING; + /*~ We store an copy of our arguments before parsing mangles them, so + * we can re-exec if versions of subdaemons change. Note the use of + * notleak() since our leak-detector can't find orig_argv on the + * stack. */ + orig_argv = notleak(tal_arr(ld, char *, argc + 1)); + for (size_t i = 1; i < argc; i++) + orig_argv[i] = tal_strdup(orig_argv, argv[i]); + /*~ Turn argv[0] into an absolute path (if not already) */ + orig_argv[0] = path_join(orig_argv, take(path_cwd(NULL)), argv[0]); + orig_argv[argc] = NULL; + /* Figure out where our daemons are first. */ ld->daemon_dir = find_daemon_dir(ld, argv[0]); if (!ld->daemon_dir) @@ -1139,6 +1153,11 @@ int main(int argc, char *argv[]) * ld->payments, so clean that up. */ clean_tmpctx(); + /* Gather these before we free ld! */ + try_reexec = ld->try_reexec; + if (try_reexec) + tal_steal(NULL, orig_argv); + /* Free this last: other things may clean up timers. */ timers = tal_steal(NULL, ld->timers); tal_free(ld); @@ -1156,6 +1175,21 @@ int main(int argc, char *argv[]) tal_free(stop_response); } + /* Were we supposed to restart ourselves? */ + if (try_reexec) { + long max_fd; + + /* Give a reasonable chance for the install to finish. */ + sleep(5); + + /* Close all filedescriptors except stdin/stdout/stderr */ + max_fd = sysconf(_SC_OPEN_MAX); + for (int i = STDERR_FILENO+1; i < max_fd; i++) + close(i); + execv(orig_argv[0], orig_argv); + err(1, "Failed to re-exec ourselves after version change"); + } + /*~ Farewell. Next stop: hsmd/hsmd.c. */ return exit_code; } diff --git a/lightningd/lightningd.h b/lightningd/lightningd.h index 83e1f899d..22dba04f0 100644 --- a/lightningd/lightningd.h +++ b/lightningd/lightningd.h @@ -275,6 +275,9 @@ struct lightningd { /* The round-robin list of channels, for use when doing MPP. */ u64 rr_counter; + + /* Should we re-exec ourselves instead of just exiting? */ + bool try_reexec; }; /* Turning this on allows a tal allocation to return NULL, rather than aborting. diff --git a/lightningd/subd.c b/lightningd/subd.c index a9f6e1bc6..c0467d4c1 100644 --- a/lightningd/subd.c +++ b/lightningd/subd.c @@ -412,8 +412,12 @@ static bool handle_version(struct subd *sd, const u8 *msg) return false; if (!streq(ver, version())) { - fatal("subdaemon %s version '%s' not '%s'", - sd->name, ver, version()); + log_broken(sd->log, "version '%s' not '%s': restarting", + ver, version()); + sd->ld->try_reexec = true; + /* Return us to toplevel lightningd.c */ + io_break(sd->ld); + return false; } return true; } @@ -473,7 +477,7 @@ static struct io_plan *sd_msg_read(struct io_conn *conn, struct subd *sd) goto next; case WIRE_STATUS_VERSION: if (!handle_version(sd, sd->msg_in)) - goto malformed; + goto close; goto next; } diff --git a/tests/plugins/badopeningd.sh b/tests/plugins/badopeningd.sh new file mode 100755 index 000000000..b674ca2a4 --- /dev/null +++ b/tests/plugins/badopeningd.sh @@ -0,0 +1,11 @@ +#! /bin/sh + +# If this file exists, we send that message back, then sleep. +if [ $# = 0 ] && [ -f openingd-version ]; then + # lightningd expects us to write to stdin! + cat openingd-version >&0 + sleep 10 + exit 0 +fi + +exec "$(cat openingd-real)" "$@" diff --git a/tests/test_misc.py b/tests/test_misc.py index b3668494e..36a674496 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -2499,3 +2499,33 @@ def test_listforwards(node_factory, bitcoind): # out_channel=c24 c24_forwards = l2.rpc.listforwards(out_channel=c24)['forwards'] assert len(c24_forwards) == 1 + + +def test_version_reexec(node_factory, bitcoind): + badopeningd = os.path.join(os.path.dirname(__file__), "plugins", "badopeningd.sh") + version = subprocess.check_output(['lightningd/lightningd', + '--version']).decode('utf-8').splitlines()[0] + + l1, l2 = node_factory.get_nodes(2, opts=[{'subdaemon': 'openingd:' + badopeningd, + 'start': False, + 'allow_broken_log': True}, + {}]) + # We use a file to tell our openingd wrapper where the real one is + with open(os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "openingd-real"), 'w') as f: + f.write(os.path.abspath('lightningd/lightning_openingd')) + + l1.start() + # This is a "version" message + verfile = os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "openingd-version") + with open(verfile, 'wb') as f: + f.write(bytes.fromhex('0000000d' # len + 'fff6')) # type + f.write(bytes('badversion\0', encoding='utf8')) + + l1.rpc.connect(l2.info['id'], 'localhost', l2.port) + + l1.daemon.wait_for_log("openingd.*version 'badversion' not '{}': restarting".format(version)) + + # Now "fix" it, it should restart. + os.unlink(verfile) + l1.daemon.wait_for_log("Server started with public key")