mirror of
https://github.com/ElementsProject/lightning.git
synced 2025-02-22 14:42:40 +01:00
lightningd: don't abort on incorrect versions, but try to re-exec.
You still shouldn't do this (you could get some transient failures), but at least you have a decent chance if you reinstall over a running daemon, instead of getting confusing internal errors if message formats have changed. Changelog-Added: lightningd: we now try to restart if subdaemons are upgraded underneath us. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Fixes: #4346
This commit is contained in:
parent
8714bf903c
commit
32d650f9df
6 changed files with 87 additions and 4 deletions
|
@ -1222,7 +1222,8 @@ class NodeFactory(object):
|
||||||
'random_hsm',
|
'random_hsm',
|
||||||
'feerates',
|
'feerates',
|
||||||
'wait_for_bitcoind_sync',
|
'wait_for_bitcoind_sync',
|
||||||
'allow_bad_gossip'
|
'allow_bad_gossip',
|
||||||
|
'start',
|
||||||
]
|
]
|
||||||
node_opts = {k: v for k, v in opts.items() if k in node_opt_keys}
|
node_opts = {k: v for k, v in opts.items() if k in node_opt_keys}
|
||||||
cli_opts = {k: v for k, v in opts.items() if k not in node_opt_keys}
|
cli_opts = {k: v for k, v in opts.items() if k not in node_opt_keys}
|
||||||
|
|
|
@ -211,6 +211,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx)
|
||||||
ld->listen = true;
|
ld->listen = true;
|
||||||
ld->autolisten = true;
|
ld->autolisten = true;
|
||||||
ld->reconnect = true;
|
ld->reconnect = true;
|
||||||
|
ld->try_reexec = false;
|
||||||
|
|
||||||
/*~ This is from ccan/timer: it is efficient for the case where timers
|
/*~ This is from ccan/timer: it is efficient for the case where timers
|
||||||
* are deleted before expiry (as is common with timeouts) using an
|
* are deleted before expiry (as is common with timeouts) using an
|
||||||
|
@ -849,6 +850,8 @@ int main(int argc, char *argv[])
|
||||||
struct rlimit nofile = {1024, 1024};
|
struct rlimit nofile = {1024, 1024};
|
||||||
int sigchld_rfd;
|
int sigchld_rfd;
|
||||||
int exit_code = 0;
|
int exit_code = 0;
|
||||||
|
char **orig_argv;
|
||||||
|
bool try_reexec;
|
||||||
|
|
||||||
/*~ Make sure that we limit ourselves to something reasonable. Modesty
|
/*~ Make sure that we limit ourselves to something reasonable. Modesty
|
||||||
* is a virtue. */
|
* is a virtue. */
|
||||||
|
@ -883,6 +886,17 @@ int main(int argc, char *argv[])
|
||||||
ld = new_lightningd(NULL);
|
ld = new_lightningd(NULL);
|
||||||
ld->state = LD_STATE_RUNNING;
|
ld->state = LD_STATE_RUNNING;
|
||||||
|
|
||||||
|
/*~ We store an copy of our arguments before parsing mangles them, so
|
||||||
|
* we can re-exec if versions of subdaemons change. Note the use of
|
||||||
|
* notleak() since our leak-detector can't find orig_argv on the
|
||||||
|
* stack. */
|
||||||
|
orig_argv = notleak(tal_arr(ld, char *, argc + 1));
|
||||||
|
for (size_t i = 1; i < argc; i++)
|
||||||
|
orig_argv[i] = tal_strdup(orig_argv, argv[i]);
|
||||||
|
/*~ Turn argv[0] into an absolute path (if not already) */
|
||||||
|
orig_argv[0] = path_join(orig_argv, take(path_cwd(NULL)), argv[0]);
|
||||||
|
orig_argv[argc] = NULL;
|
||||||
|
|
||||||
/* Figure out where our daemons are first. */
|
/* Figure out where our daemons are first. */
|
||||||
ld->daemon_dir = find_daemon_dir(ld, argv[0]);
|
ld->daemon_dir = find_daemon_dir(ld, argv[0]);
|
||||||
if (!ld->daemon_dir)
|
if (!ld->daemon_dir)
|
||||||
|
@ -1139,6 +1153,11 @@ int main(int argc, char *argv[])
|
||||||
* ld->payments, so clean that up. */
|
* ld->payments, so clean that up. */
|
||||||
clean_tmpctx();
|
clean_tmpctx();
|
||||||
|
|
||||||
|
/* Gather these before we free ld! */
|
||||||
|
try_reexec = ld->try_reexec;
|
||||||
|
if (try_reexec)
|
||||||
|
tal_steal(NULL, orig_argv);
|
||||||
|
|
||||||
/* Free this last: other things may clean up timers. */
|
/* Free this last: other things may clean up timers. */
|
||||||
timers = tal_steal(NULL, ld->timers);
|
timers = tal_steal(NULL, ld->timers);
|
||||||
tal_free(ld);
|
tal_free(ld);
|
||||||
|
@ -1156,6 +1175,21 @@ int main(int argc, char *argv[])
|
||||||
tal_free(stop_response);
|
tal_free(stop_response);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Were we supposed to restart ourselves? */
|
||||||
|
if (try_reexec) {
|
||||||
|
long max_fd;
|
||||||
|
|
||||||
|
/* Give a reasonable chance for the install to finish. */
|
||||||
|
sleep(5);
|
||||||
|
|
||||||
|
/* Close all filedescriptors except stdin/stdout/stderr */
|
||||||
|
max_fd = sysconf(_SC_OPEN_MAX);
|
||||||
|
for (int i = STDERR_FILENO+1; i < max_fd; i++)
|
||||||
|
close(i);
|
||||||
|
execv(orig_argv[0], orig_argv);
|
||||||
|
err(1, "Failed to re-exec ourselves after version change");
|
||||||
|
}
|
||||||
|
|
||||||
/*~ Farewell. Next stop: hsmd/hsmd.c. */
|
/*~ Farewell. Next stop: hsmd/hsmd.c. */
|
||||||
return exit_code;
|
return exit_code;
|
||||||
}
|
}
|
||||||
|
|
|
@ -275,6 +275,9 @@ struct lightningd {
|
||||||
|
|
||||||
/* The round-robin list of channels, for use when doing MPP. */
|
/* The round-robin list of channels, for use when doing MPP. */
|
||||||
u64 rr_counter;
|
u64 rr_counter;
|
||||||
|
|
||||||
|
/* Should we re-exec ourselves instead of just exiting? */
|
||||||
|
bool try_reexec;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Turning this on allows a tal allocation to return NULL, rather than aborting.
|
/* Turning this on allows a tal allocation to return NULL, rather than aborting.
|
||||||
|
|
|
@ -412,8 +412,12 @@ static bool handle_version(struct subd *sd, const u8 *msg)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (!streq(ver, version())) {
|
if (!streq(ver, version())) {
|
||||||
fatal("subdaemon %s version '%s' not '%s'",
|
log_broken(sd->log, "version '%s' not '%s': restarting",
|
||||||
sd->name, ver, version());
|
ver, version());
|
||||||
|
sd->ld->try_reexec = true;
|
||||||
|
/* Return us to toplevel lightningd.c */
|
||||||
|
io_break(sd->ld);
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -473,7 +477,7 @@ static struct io_plan *sd_msg_read(struct io_conn *conn, struct subd *sd)
|
||||||
goto next;
|
goto next;
|
||||||
case WIRE_STATUS_VERSION:
|
case WIRE_STATUS_VERSION:
|
||||||
if (!handle_version(sd, sd->msg_in))
|
if (!handle_version(sd, sd->msg_in))
|
||||||
goto malformed;
|
goto close;
|
||||||
goto next;
|
goto next;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
11
tests/plugins/badopeningd.sh
Executable file
11
tests/plugins/badopeningd.sh
Executable file
|
@ -0,0 +1,11 @@
|
||||||
|
#! /bin/sh
|
||||||
|
|
||||||
|
# If this file exists, we send that message back, then sleep.
|
||||||
|
if [ $# = 0 ] && [ -f openingd-version ]; then
|
||||||
|
# lightningd expects us to write to stdin!
|
||||||
|
cat openingd-version >&0
|
||||||
|
sleep 10
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
exec "$(cat openingd-real)" "$@"
|
|
@ -2499,3 +2499,33 @@ def test_listforwards(node_factory, bitcoind):
|
||||||
# out_channel=c24
|
# out_channel=c24
|
||||||
c24_forwards = l2.rpc.listforwards(out_channel=c24)['forwards']
|
c24_forwards = l2.rpc.listforwards(out_channel=c24)['forwards']
|
||||||
assert len(c24_forwards) == 1
|
assert len(c24_forwards) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_version_reexec(node_factory, bitcoind):
|
||||||
|
badopeningd = os.path.join(os.path.dirname(__file__), "plugins", "badopeningd.sh")
|
||||||
|
version = subprocess.check_output(['lightningd/lightningd',
|
||||||
|
'--version']).decode('utf-8').splitlines()[0]
|
||||||
|
|
||||||
|
l1, l2 = node_factory.get_nodes(2, opts=[{'subdaemon': 'openingd:' + badopeningd,
|
||||||
|
'start': False,
|
||||||
|
'allow_broken_log': True},
|
||||||
|
{}])
|
||||||
|
# We use a file to tell our openingd wrapper where the real one is
|
||||||
|
with open(os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "openingd-real"), 'w') as f:
|
||||||
|
f.write(os.path.abspath('lightningd/lightning_openingd'))
|
||||||
|
|
||||||
|
l1.start()
|
||||||
|
# This is a "version" message
|
||||||
|
verfile = os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "openingd-version")
|
||||||
|
with open(verfile, 'wb') as f:
|
||||||
|
f.write(bytes.fromhex('0000000d' # len
|
||||||
|
'fff6')) # type
|
||||||
|
f.write(bytes('badversion\0', encoding='utf8'))
|
||||||
|
|
||||||
|
l1.rpc.connect(l2.info['id'], 'localhost', l2.port)
|
||||||
|
|
||||||
|
l1.daemon.wait_for_log("openingd.*version 'badversion' not '{}': restarting".format(version))
|
||||||
|
|
||||||
|
# Now "fix" it, it should restart.
|
||||||
|
os.unlink(verfile)
|
||||||
|
l1.daemon.wait_for_log("Server started with public key")
|
||||||
|
|
Loading…
Add table
Reference in a new issue