core-lightning/lightningd/lightningd.c
Rusty Russell 41ef85318d onionmessages: remove obsolete onion message parsing.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2022-09-29 16:10:57 +09:30

1293 lines
47 KiB
C

/*~ Welcome, wonderful reader!
*
* This is the Core of um, Core Lightning: the main file of the master daemon
* `lightningd`. It's mainly cluttered with the miscellany of setup,
* and a few startup sanity checks.
*
* The role of this daemon is to start the subdaemons, shuffle peers
* between them, handle the JSON RPC requests, bitcoind, the database
* and centralize logging. In theory, it doesn't trust the other
* daemons, though we expect `hsmd` (which holds secret keys) to be
* responsive.
*
* Comments beginning with a ~ (like this one!) are part of our shared
* adventure through the source, so they're more meta than normal code
* comments, and meant to be read in a certain order.
*/
/*~ Notice how includes are in ASCII order: this is actually enforced by
* the build system under `make check-source`. It avoids merge conflicts
* and keeps things consistent. It also make sure you include "config.h"
* before anything else. */
#include "config.h"
/*~ This is Ian Lance Taylor's libbacktrace. It turns out that it's
* horrifically difficult to obtain a decent backtrace in C; the standard
* backtrace function is useless in most programs. */
/*~ These headers are from CCAN: http://ccodearchive.net.
*
* It's another one of Rusty's projects, and we copy and paste it
* automatically into the source tree here, so you should never edit
* it. There's a Makefile target update-ccan to update it (and add modules
* if CCAN_NEW is specified).
*
* The most used of these are `ccan/tal` and `ccan/take`, which we'll describe
* in detail below.
*/
#include <ccan/array_size/array_size.h>
#include <ccan/closefrom/closefrom.h>
#include <ccan/json_escape/json_escape.h>
#include <ccan/opt/opt.h>
#include <ccan/pipecmd/pipecmd.h>
#include <ccan/read_write_all/read_write_all.h>
#include <ccan/tal/grab_file/grab_file.h>
#include <ccan/tal/path/path.h>
#include <ccan/tal/str/str.h>
/*~ This is common code: routines shared by one or more executables
* (separate daemons, or the lightning-cli program). */
#include <common/daemon.h>
#include <common/ecdh_hsmd.h>
#include <common/hsm_encryption.h>
#include <common/memleak.h>
#include <common/timeout.h>
#include <common/type_to_string.h>
#include <common/version.h>
#include <db/exec.h>
#include <errno.h>
#include <fcntl.h>
#include <header_versions_gen.h>
#include <lightningd/chaintopology.h>
#include <lightningd/channel.h>
#include <lightningd/channel_control.h>
#include <lightningd/coin_mvts.h>
#include <lightningd/connect_control.h>
#include <lightningd/gossip_control.h>
#include <lightningd/hsm_control.h>
#include <lightningd/io_loop_with_timers.h>
#include <lightningd/lightningd.h>
#include <lightningd/onchain_control.h>
#include <lightningd/options.h>
#include <lightningd/peer_control.h>
#include <lightningd/plugin.h>
#include <lightningd/subd.h>
#include <sys/resource.h>
#include <wallet/txfilter.h>
#include <wally_bip32.h>
static void destroy_alt_subdaemons(struct lightningd *ld);
#if DEVELOPER
static void memleak_help_alt_subdaemons(struct htable *memtable,
struct lightningd *ld);
#endif /* DEVELOPER */
/*~ The core lightning object: it's passed everywhere, and is basically a
* global variable. This new_xxx pattern is something we'll see often:
* it allocates and initializes a new structure, using *tal*, the hierarchical
* allocator. */
static struct lightningd *new_lightningd(const tal_t *ctx)
{
/*~ tal: each allocation is a child of an existing object (or NULL,
* the top-level object). When an object is freed, all the objects
* `tallocated` off it are also freed. We use it in place of malloc
* and free. For the technically inclined: tal allocations usually
* build a tree, and tal_freeing any node in the tree will result in
* the entire subtree rooted at that node to be freed.
*
* It's incredibly useful for grouping object lifetimes, as we'll see.
* For example, a `struct lightningd` has a pointer to a `log_book`
* which is allocated off the `struct lightningd`, and has its own
* internal members allocated off `log_book`: freeing `struct
* lightningd` frees them all.
*
* In this case, freeing `ctx` will free `ld`:
*/
struct lightningd *ld = tal(ctx, struct lightningd);
/*~ Style note: `ctx` is declared `const`, yet we can `tallocate` from
* it. Adding/removing children is not considered to change an
* object; nor, in fact, is freeing it with tal_free(). This allows
* us to use const more liberally: the style rule here is that you
* should use 'const' on pointers if you can. */
/*~ Note that we generally EXPLICITLY #if-wrap DEVELOPER code. This
* is a nod to keeping it minimal and explicit: we need this code for
* testing, but its existence means we're not actually testing the
* same exact code users will be running. */
#if DEVELOPER
ld->dev_debug_subprocess = NULL;
ld->dev_no_plugin_checksum = false;
ld->dev_disconnect_fd = -1;
ld->dev_subdaemon_fail = false;
ld->dev_allow_localhost = false;
ld->dev_gossip_time = 0;
ld->dev_fast_gossip = false;
ld->dev_fast_gossip_prune = false;
ld->dev_fast_reconnect = false;
ld->dev_force_privkey = NULL;
ld->dev_force_bip32_seed = NULL;
ld->dev_force_channel_secrets = NULL;
ld->dev_force_channel_secrets_shaseed = NULL;
ld->dev_force_tmp_channel_id = NULL;
ld->dev_no_htlc_timeout = false;
ld->dev_no_version_checks = false;
ld->dev_max_funding_unconfirmed = 2016;
ld->dev_ignore_modern_onion = false;
ld->dev_disable_commit = -1;
ld->dev_no_ping_timer = false;
#endif
/*~ These are CCAN lists: an embedded double-linked list. It's not
* really typesafe, but relies on convention to access the contents.
* It's inspired by the closely-related Linux kernel list.h.
*
* You declare them as a `struct list_head` (or use the LIST_HEAD()
* macro which doesn't work on dynamically-allocated objects like `ld`
* here). The item which will go into the list must be declared
* a `struct list_node` for each list it can be in.
*
* The most common operations are list_head_init(), list_add(),
* list_del() and list_for_each().
*
* This method of manually declaring the list hooks avoids dynamic
* allocations to put things into a list. */
list_head_init(&ld->peers);
list_head_init(&ld->subds);
/*~ These are hash tables of incoming and outgoing HTLCs (contracts),
* defined as `struct htlc_in` and `struct htlc_out` in htlc_end.h.
* The hash tables are declared there using the very ugly
* HTABLE_DEFINE_TYPE macro. The key is the channel the HTLC is in
* and the 64-bit htlc-id which is unique for that channel and
* direction. That htlc-id is used in the inter-peer wire protocol,
* so it is the logical key.
*
* There aren't usually many HTLCs, so we could have just used a linked
* list attached to the channel structure itself, or even left them in
* the database rather than making an in-memory version. Obviously
* I was in a premature optimization mood when I wrote this: */
htlc_in_map_init(&ld->htlcs_in);
htlc_out_map_init(&ld->htlcs_out);
/*~ For multi-part payments, we need to keep some incoming payments
* in limbo until we get all the parts, or we time them out. */
htlc_set_map_init(&ld->htlc_sets);
/*~ We have a multi-entry log-book infrastructure: we define a 10MB log
* book to hold all the entries (and trims as necessary), and multiple
* log objects which each can write into it, each with a unique
* prefix. */
ld->log_book = new_log_book(ld, 10*1024*1024);
/*~ Note the tal context arg (by convention, the first argument to any
* allocation function): ld->log will be implicitly freed when ld
* is. */
ld->log = new_log(ld, ld->log_book, NULL, "lightningd");
ld->logfiles = NULL;
/*~ We explicitly set these to NULL: if they're still NULL after option
* parsing, we know they're to be set to the defaults. */
ld->alias = NULL;
ld->rgb = NULL;
list_head_init(&ld->connects);
list_head_init(&ld->waitsendpay_commands);
list_head_init(&ld->sendpay_commands);
list_head_init(&ld->close_commands);
list_head_init(&ld->ping_commands);
list_head_init(&ld->disconnect_commands);
list_head_init(&ld->waitblockheight_commands);
/*~ Tal also explicitly supports arrays: it stores the number of
* elements, which can be accessed with tal_count() (or tal_bytelen()
* for raw bytecount). It's common for simple arrays to use
* tal_resize() (or tal_arr_expand) to expand, which does not work on
* NULL. So we start with a zero-length array. */
ld->proposed_wireaddr = tal_arr(ld, struct wireaddr_internal, 0);
ld->proposed_listen_announce = tal_arr(ld, enum addr_listen_announce, 0);
ld->remote_addr_v4 = NULL;
ld->remote_addr_v6 = NULL;
ld->discovered_ip_v4 = NULL;
ld->discovered_ip_v6 = NULL;
ld->listen = true;
ld->autolisten = true;
ld->reconnect = true;
ld->try_reexec = false;
ld->db_upgrade_ok = NULL;
/*~ This is from ccan/timer: it is efficient for the case where timers
* are deleted before expiry (as is common with timeouts) using an
* ingenious bucket system which more precisely sorts timers as they
* approach expiry. It's a fascinating implementation you should read
* if you have a spare few hours. */
ld->timers = tal(ld, struct timers);
timers_init(ld->timers, time_mono());
/*~ This is detailed in chaintopology.c */
ld->topology = new_topology(ld, ld->log);
ld->blockheight = 0;
ld->daemon_parent_fd = -1;
ld->proxyaddr = NULL;
ld->always_use_proxy = false;
ld->pure_tor_setup = false;
ld->tor_service_password = NULL;
ld->websocket_port = 0;
/*~ This is initialized later, but the plugin loop examines this,
* so set it to NULL explicitly now. */
ld->wallet = NULL;
/*~ Behavioral options */
ld->accept_extra_tlv_types = tal_arr(ld, u64, 0);
/*~ In the next step we will initialize the plugins. This will
* also populate the JSON-RPC with passthrough methods, hence
* lightningd needs to have something to put those in. This
* is that :-)
*/
jsonrpc_setup(ld);
/*~ We run a number of plugins (subprocesses that we talk JSON-RPC with)
* alongside this process. This allows us to have an easy way for users
* to add their own tools without having to modify the Core Lightning source
* code. Here we initialize the context that will keep track and control
* the plugins.
*/
ld->plugins = plugins_new(ld, ld->log_book, ld);
ld->plugins->startup = true;
/*~ This is set when a JSON RPC command comes in to shut us down. */
ld->stop_conn = NULL;
/*~ This is used to signal that `hsm_secret` is encrypted, and will
* be set to `true` if the `--encrypted-hsm` option is passed at startup.
*/
ld->encrypted_hsm = false;
/* This is used to override subdaemons */
strmap_init(&ld->alt_subdaemons);
tal_add_destructor(ld, destroy_alt_subdaemons);
memleak_add_helper(ld, memleak_help_alt_subdaemons);
/*~ We change umask if we daemonize, but not if we don't. Initialize the
* initial_umask anyway as we might rely on it later (`plugin start`). */
ld->initial_umask = umask(0);
umask(ld->initial_umask);
/*~ This is the mode of the created JSON-RPC socket file, in
* traditional Unix octal. 0600 means only the user that ran
* lightningd can invoke RPC on it. Changing it to 0660 may
* be sensible if you run lightningd in its own system user,
* and just let specific users (add the group of the
* lightningd runner as an ancillary group) access its
* RPC. Can be overridden with `--rpc-file-mode`.
*/
ld->rpc_filemode = 0600;
/*~ This is the exit code to use on exit.
* Set to NULL meaning we are not interested in exiting yet.
*/
ld->exit_code = NULL;
/*~ We maintain a round-robin list of channels.
* This round-robin list of channels is used to ensure that
* each invoice we generate has a different set of channels. */
ld->rr_counter = 0;
/*~ Because fee estimates on testnet and regtest are unreliable,
* we allow overriding them with --force-feerates, in which
* case this is a pointer to an enum feerate-indexed array of values */
ld->force_feerates = NULL;
return ld;
}
/*~ We list our daemons here so on startup we can test they're the
* correct versions and that they exist. */
static const char *subdaemons[] = {
"lightning_channeld",
"lightning_closingd",
"lightning_connectd",
"lightning_gossipd",
"lightning_hsmd",
"lightning_onchaind",
"lightning_openingd"
};
/* Return true if called with a recognized subdaemon e.g. "hsmd" */
bool is_subdaemon(const char *sdname)
{
for (size_t i = 0; i < ARRAY_SIZE(subdaemons); i++)
/* Skip the "lightning_" prefix in the table */
if (streq(sdname, subdaemons[i] + strlen("lightning_")))
return true;
return false;
}
static void destroy_alt_subdaemons(struct lightningd *ld)
{
strmap_clear(&ld->alt_subdaemons);
}
#if DEVELOPER
static void memleak_help_alt_subdaemons(struct htable *memtable,
struct lightningd *ld)
{
memleak_scan_strmap(memtable, &ld->alt_subdaemons);
}
#endif /* DEVELOPER */
const char *subdaemon_path(const tal_t *ctx, const struct lightningd *ld, const char *name)
{
/* Strip the leading "lightning_" before looking in alt_subdaemons.
*/
size_t pfxlen = strlen("lightning_");
assert(strlen(name) > pfxlen);
const char *short_name = tal_strdup(ctx, name + pfxlen);
/* Is there an alternate path for this subdaemon? */
const char *dpath;
const char *alt = strmap_get(&ld->alt_subdaemons, short_name);
if (alt) {
/* path_join will honor absolute paths as well. */
dpath = path_join(ctx, ld->daemon_dir, alt);
} else {
/* This subdaemon is found in the standard place. */
dpath = path_join(ctx, ld->daemon_dir, name);
}
return dpath;
}
/*~ Check we can run them, and check their versions */
void test_subdaemons(const struct lightningd *ld)
{
size_t i;
/*~ CCAN's ARRAY_SIZE() should always be used on defined arrays like
* the subdaemons array above. You can calculate the number of
* elements it has using `sizeof(subdaemons)/sizeof(subdaemons[0])`
* but if `subdaemons` were refactored into a pointer (eg. to make
* it a dynamic array) that would erroneously evaluate to `1`.
*
* ARRAY_SIZE will cause a compiler error if the argument is actually
* a pointer, not an array. */
for (i = 0; i < ARRAY_SIZE(subdaemons); i++) {
/*~ CCAN's path module uses tal, so wants a context to
* allocate from. We have a magic convenience context
* `tmpctx` for temporary allocations like this.
*
* Because all our daemons at their core are of form `while
* (!stopped) handle_events();` (an event loop pattern), we
* can free `tmpctx` in that top-level loop after each event
* is handled.
*/
int outfd;
const char *dpath = subdaemon_path(tmpctx, ld, subdaemons[i]);
const char *verstring;
/*~ CCAN's pipecmd module is like popen for grownups: it
* takes pointers to fill in stdin, stdout and stderr file
* descriptors if desired, and the remainder of arguments
* are the command and its argument. */
pid_t pid = pipecmd(NULL, &outfd, &outfd,
dpath, "--version", NULL);
/*~ Our logging system: spam goes in at log_debug level, but
* logging is mainly added by developer necessity and removed
* by developer/user complaints. The only strong convention
* is that log_broken() is used for "should never happen".
*
* Note, however, that logging takes care to preserve the
* global `errno` which is set above. */
log_debug(ld->log, "testing %s", dpath);
/*~ ccan/err is a wrapper around BSD's err.h, which defines
* the convenience functions err() (error with message
* followed by a string based on errno) and errx() (same,x
* but no errno string). */
if (pid == -1)
err(EXITCODE_SUBDAEMON_FAIL, "Could not run %s", dpath);
/*~ CCAN's grab_file module contains a routine to read into a
* tallocated buffer until EOF */
verstring = grab_fd(tmpctx, outfd);
/*~ Like many CCAN modules, it set errno on failure, which
* err (ccan/err, but usually just the BSD <err.h>) prints */
if (!verstring)
err(1, "Could not get output from %s", dpath);
/*~ strstarts is from CCAN/str. */
if (!strstarts(verstring, version())
|| verstring[strlen(version())] != '\n')
errx(EXITCODE_SUBDAEMON_FAIL, "%s: bad version '%s'",
subdaemons[i], verstring);
/*~ The child will be reaped by sigchld_rfd_in, so we don't
* need to waitpid() here. */
}
}
/* Check if all subdaemons exist in specified directory. */
static bool has_all_subdaemons(const char *daemon_dir)
{
size_t i;
bool missing_daemon = false;
for (i = 0; i < ARRAY_SIZE(subdaemons); ++i) {
if (!path_is_file(path_join(tmpctx, daemon_dir, subdaemons[i]))) {
missing_daemon = true;
break;
}
}
return !missing_daemon;
}
/* Returns the directory this executable is running from */
static const char *find_my_directory(const tal_t *ctx, const char *argv0)
{
/* find_my_abspath simply exits on failure, so never returns NULL. */
const char *me = find_my_abspath(NULL, argv0);
/*~ The caller just wants the directory we're in.
*
* Note the magic `take()` macro here: it annotates a pointer as "to
* be taken", and the recipient is expected to take ownership of the
* pointer. This improves efficiency because the recipient might
* choose to use or even keep it rather than make a copy (or it
* might just free it).
*
* Many CCAN and our own routines support this, but if you hand a
* `take()` to a routine which *doesn't* expect it, unfortunately you
* don't get a compile error (we have runtime detection for this
* case, however).
*/
return path_dirname(ctx, take(me));
}
/*~ This returns the PKGLIBEXEC path which is where binaries get installed.
* Note the `TAKES` annotation which indicates that the `my_path` parameter
* can be take(); in which case, this function will handle freeing it.
*
* TAKES is only a convention unfortunately, and ignored by the compiler.
*/
static const char *find_my_pkglibexec_path(struct lightningd *ld,
const char *my_path TAKES)
{
const char *pkglibexecdir;
/*~`path_join` is declared in ccan/path/path.h as:
*
* char *path_join(const tal_t *ctx,
* const char *base TAKES, const char *a TAKES);
*
* So, as we promised with 'TAKES' in our own declaration, if the
* caller has called `take()` the `my_path` parameter, path_join()
* will free it. */
pkglibexecdir = path_join(NULL, my_path, BINTOPKGLIBEXECDIR);
/*~ The plugin dir is in ../libexec/c-lightning/plugins, which (unlike
* those given on the command line) does not need to exist. */
plugins_set_builtin_plugins_dir(ld->plugins,
path_join(tmpctx,
pkglibexecdir, "plugins"));
/*~ Sometimes take() can be more efficient, since the routine can
* manipulate the string in place. This is the case here. */
return path_simplify(ld, take(pkglibexecdir));
}
/* Determine the correct daemon dir. */
static const char *find_daemon_dir(struct lightningd *ld, const char *argv0)
{
const char *my_path = find_my_directory(ld, argv0);
/* If we're running in-tree, all the subdaemons are with lightningd. */
if (has_all_subdaemons(my_path)) {
/* In this case, look for built-in plugins in ../plugins */
plugins_set_builtin_plugins_dir(ld->plugins,
path_join(tmpctx,
my_path,
"../plugins"));
return my_path;
}
/* Otherwise we assume they're in the installed dir. */
return find_my_pkglibexec_path(ld, take(my_path));
}
/*~ We like to free everything on exit, so valgrind doesn't complain (valgrind
* is an awesome runtime memory usage detector for C and C++ programs). In
* some ways it would be neater not to do this, but it turns out some
* transient objects still need cleaning. */
static void free_all_channels(struct lightningd *ld)
{
struct peer *p;
/*~ tal supports *destructors* using `tal_add_destructor()`; the most
* common use is for an object to delete itself from a linked list
* when it's freed.
*
* As a result, freeing an object (which frees any tal objects
* allocated off it, and any allocated off them, etc) may cause
* callbacks; in this case, some objects freed here can cause database
* writes, which must be inside a transaction. */
db_begin_transaction(ld->wallet->db);
/* Now we free all the HTLCs */
free_htlcs(ld, NULL);
/*~ For every peer, we free every channel. On allocation the peer was
* given a destructor (`destroy_peer`) which removes itself from the
* list. Thus we use list_top() not list_pop() here. */
while ((p = list_top(&ld->peers, struct peer, list)) != NULL) {
struct channel *c;
/*~ A peer can have multiple channels; we only allow one to be
* open at any time, but we remember old ones for 100 blocks,
* after all the outputs we care about are spent. */
while ((c = list_top(&p->channels, struct channel, list))
!= NULL) {
/* Removes itself from list as we free it */
tal_free(c);
}
/* A peer may have a channel in the process of opening. */
if (p->uncommitted_channel) {
struct uncommitted_channel *uc = p->uncommitted_channel;
/* Setting to NULL stops destroy_uncommitted_channel
* from trying to remove peer from db! */
p->uncommitted_channel = NULL;
tal_free(uc);
}
/* Removes itself from list as we free it */
tal_free(p);
}
/*~ Commit the transaction. Note that the db is actually
* single-threaded, so commits never fail and we don't need
* spin-and-retry logic everywhere. */
db_commit_transaction(ld->wallet->db);
}
static void shutdown_global_subdaemons(struct lightningd *ld)
{
/* Let everyone shutdown cleanly. */
close(ld->hsm_fd);
/*~ The three "global" daemons, which we shutdown explicitly: we
* give them 10 seconds to exit gracefully before killing them. */
ld->connectd = subd_shutdown(ld->connectd, 10);
ld->gossip = subd_shutdown(ld->gossip, 10);
ld->hsm = subd_shutdown(ld->hsm, 10);
}
/*~ Our wallet logic needs to know what outputs we might be interested in. We
* use BIP32 (a.k.a. "HD wallet") to generate keys from a single seed, so we
* keep the maximum-ever-used key index in the db, and add them all to the
* filter here. */
static void init_txfilter(struct wallet *w, struct txfilter *filter)
{
/*~ This is defined in libwally, so we didn't have to reimplement */
struct ext_key ext;
/*~ Note the use of ccan/short_types u64 rather than uint64_t.
* Thank me later. */
u64 bip32_max_index;
bip32_max_index = db_get_intvar(w->db, "bip32_max_index", 0);
/*~ One of the C99 things I unequivocally approve: for-loop scope. */
for (u64 i = 0; i <= bip32_max_index + w->keyscan_gap; i++) {
if (bip32_key_from_parent(w->bip32_base, i, BIP32_FLAG_KEY_PUBLIC, &ext) != WALLY_OK) {
abort();
}
txfilter_add_derkey(filter, ext.pub_key);
}
}
/*~ The normal advice for daemons is to move into the root directory, so you
* don't prevent unmounting whatever filesystem you happen to start in.
*
* But we define every path relative to our (~/.lightning) data dir, so we
* make sure we stay there. The rest of this is taken from ccan/daemonize,
* which was based on W. Richard Stevens' advice in Programming in The Unix
* Environment.
*/
static void complete_daemonize(struct lightningd *ld)
{
int ok_status = 0;
/* Don't hold files open. */
close(STDIN_FILENO);
close(STDOUT_FILENO);
close(STDERR_FILENO);
/* Many routines write to stderr; that can cause chaos if used
* for something else, so set it here. */
if (open("/dev/null", O_WRONLY) != 0)
fatal("Could not open /dev/null: %s", strerror(errno));
if (dup2(0, STDERR_FILENO) != STDERR_FILENO)
fatal("Could not dup /dev/null for stderr: %s", strerror(errno));
close(0);
/* Session leader so ^C doesn't whack us. */
if (setsid() == (pid_t)-1)
fatal("Could not setsid: %s", strerror(errno));
/* Discard our parent's old-fashioned umask prejudices. */
ld->initial_umask = umask(0);
/* OK, parent, you can exit(0) now. */
write_all(ld->daemon_parent_fd, &ok_status, sizeof(ok_status));
close(ld->daemon_parent_fd);
}
/*~ It's pretty standard behaviour (especially for daemons) to create and
* file-lock a pidfile. This not only prevents accidentally running multiple
* daemons on the same database at once, but lets nosy sysadmins see what pid
* the currently-running daemon is supposed to be. */
static void pidfile_create(const struct lightningd *ld)
{
int pid_fd;
char *pid;
/* Create PID file: relative to .config dir. */
pid_fd = open(ld->pidfile, O_WRONLY|O_CREAT, 0640);
if (pid_fd < 0)
err(1, "Failed to open PID file");
/* Lock PID file, so future lockf will fail. */
if (lockf(pid_fd, F_TLOCK, 0) < 0)
/* Problem locking file */
err(EXITCODE_PIDFILE_LOCK, "lightningd already running? Error locking PID file");
/*~ As closing the file will remove the lock, we need to keep it open;
* the OS will close it implicitly when we exit for any reason. */
/*~ Note that tal_fmt() is what asprintf() dreams of being. */
pid = tal_fmt(tmpctx, "%d\n", getpid());
/*~ CCAN's write_all writes to a file descriptor, looping if necessary
* (which, on a file unlike a socket, is never, for historical UNIX
* reasons). It also isn't declared with GCC's warn_unused_result
* which write() is when FORTIFY_SOURCE is defined, so we're allowed
* to ignore the result without jumping through hoops. */
write_all(pid_fd, pid, strlen(pid));
}
/*~ ccan/io allows overriding the poll() function that is the very core
* of the event loop it runs for us. We override it so that we can do
* extra sanity checks, and it's also a good point to free the tmpctx. */
static int io_poll_lightningd(struct pollfd *fds, nfds_t nfds, int timeout)
{
/* These checks and freeing tmpctx are common to all daemons. */
return daemon_poll(fds, nfds, timeout);
}
/*~ Ever had one of those functions which doesn't quite fit anywhere? Me too.
* Implementing a generic notifier framework is overkill in a static codebase
* like this, and it's always better to have compile-time calls than runtime,
* as it makes the code more explicit. But pasting in direct calls is also an
* abstraction violation, so we use this middleman function. */
void notify_new_block(struct lightningd *ld, u32 block_height)
{
/* Inform our subcomponents individually. */
htlcs_notify_new_block(ld, block_height);
channel_notify_new_block(ld, block_height);
gossip_notify_new_block(ld, block_height);
waitblockheight_notify_new_block(ld, block_height);
}
static void on_sigint(int _ UNUSED)
{
static const char *msg = "lightningd: SIGINT caught, exiting.\n";
write_all(STDERR_FILENO, msg, strlen(msg));
_exit(1);
}
static void on_sigterm(int _ UNUSED)
{
static const char *msg = "lightningd: SIGTERM caught, exiting.\n";
write_all(STDERR_FILENO, msg, strlen(msg));
_exit(1);
}
/* Globals are terrible, but we all do it. */
static int sigchld_wfd;
static void on_sigchild(int _ UNUSED)
{
/*~ UNIX signals are async, which is usually terrible. The usual
* trick, which we use here, it to write a byte to a pipe, and
* then handle it in the main event loop.
*
* This can fail if we get flooded by signals but that's OK;
* we made it non-blocking, and the reader will loop until
* there are no more children. But glibc's overzealous use of
* __attribute__((warn_unused_result)) means we have to
* "catch" the return value. */
if (write(sigchld_wfd, "", 1) != 1) {
if (errno != EAGAIN && errno != EWOULDBLOCK) {
/* Should not call this in a signal handler, but we're
* already messed up! */
fatal("on_sigchild: write errno %s", strerror(errno));
}
}
}
/*~ We only need to handle SIGTERM and SIGINT for the case we are PID 1 of
* docker container since Linux makes special this PID and requires that
* some handler exist.
*
* We also want to catch SIGCHLD, so we can report on such children and
* avoid zombies. */
static int setup_sig_handlers(void)
{
struct sigaction sigint, sigterm, sigchild;
int fds[2];
memset(&sigint, 0, sizeof(struct sigaction));
memset(&sigterm, 0, sizeof(struct sigaction));
memset(&sigchild, 0, sizeof(struct sigaction));
sigint.sa_handler = on_sigint;
sigterm.sa_handler = on_sigterm;
sigchild.sa_handler = on_sigchild;
sigchild.sa_flags = SA_RESTART;
if (1 == getpid()) {
sigaction(SIGINT, &sigint, NULL);
sigaction(SIGTERM, &sigterm, NULL);
}
if (pipe(fds) != 0)
err(1, "creating sigchild pipe");
sigchld_wfd = fds[1];
if (fcntl(sigchld_wfd, F_SETFL,
fcntl(sigchld_wfd, F_GETFL)|O_NONBLOCK) != 0)
err(1, "setting sigchild pip nonblock");
sigaction(SIGCHLD, &sigchild, NULL);
return fds[0];
}
/*~ This removes the SIGCHLD handler, so we don't try to write
* to a broken pipe. */
static void remove_sigchild_handler(struct io_conn *sigchld_conn)
{
struct sigaction sigchild;
memset(&sigchild, 0, sizeof(struct sigaction));
sigchild.sa_handler = SIG_DFL;
sigaction(SIGCHLD, &sigchild, NULL);
io_close(sigchld_conn);
}
/*~ This is the routine which sets up the sigchild handling. We just
* reap them for now so they don't become zombies, but our subd
* handling calls waitpid() synchronously, so we can't simply do this
* in the signal handler or set SIGCHLD to be ignored, which has the
* same effect.
*
* We can usually ignore these because we keep pipes to our children,
* and use the closure of those to indicate termination.
*/
static struct io_plan *sigchld_rfd_in(struct io_conn *conn,
struct lightningd *ld)
{
/* We don't actually care what we read, so we stuff things here. */
static u8 ignorebuf;
static size_t len;
pid_t childpid;
int wstatus;
/* Reap the plugins, since we otherwise ignore them. */
while ((childpid = waitpid(-1, &wstatus, WNOHANG)) != 0) {
maybe_subd_child(ld, childpid, wstatus);
}
return io_read_partial(conn, &ignorebuf, 1, &len, sigchld_rfd_in, ld);
}
/*~ We actually keep more than one set of features, used in different
* contexts. common/features.c knows how each standard feature is
* presented, so we have it generate the set for each one at a time, and
* combine them.
*
* This is inefficient, but the primitives are useful for adding single
* features later, or adding them when supplied by plugins. */
static struct feature_set *default_features(const tal_t *ctx)
{
struct feature_set *ret = NULL;
static const u32 features[] = {
OPTIONAL_FEATURE(OPT_DATA_LOSS_PROTECT),
OPTIONAL_FEATURE(OPT_UPFRONT_SHUTDOWN_SCRIPT),
OPTIONAL_FEATURE(OPT_GOSSIP_QUERIES),
COMPULSORY_FEATURE(OPT_VAR_ONION),
COMPULSORY_FEATURE(OPT_PAYMENT_SECRET),
OPTIONAL_FEATURE(OPT_BASIC_MPP),
OPTIONAL_FEATURE(OPT_GOSSIP_QUERIES_EX),
OPTIONAL_FEATURE(OPT_STATIC_REMOTEKEY),
OPTIONAL_FEATURE(OPT_SHUTDOWN_ANYSEGWIT),
OPTIONAL_FEATURE(OPT_PAYMENT_METADATA),
OPTIONAL_FEATURE(OPT_SCID_ALIAS),
OPTIONAL_FEATURE(OPT_ZEROCONF),
OPTIONAL_FEATURE(OPT_CHANNEL_TYPE),
#if EXPERIMENTAL_FEATURES
OPTIONAL_FEATURE(OPT_ANCHOR_OUTPUTS),
OPTIONAL_FEATURE(OPT_QUIESCE),
OPTIONAL_FEATURE(OPT_ONION_MESSAGES),
#endif
};
for (size_t i = 0; i < ARRAY_SIZE(features); i++) {
struct feature_set *f
= feature_set_for_feature(NULL, features[i]);
if (!ret)
ret = tal_steal(ctx, f);
else
feature_set_or(ret, take(f));
}
return ret;
}
/*~ We need this function style to hand to ecdh_hsmd_setup, but it's just a thin
* wrapper around fatal() */
static void hsm_ecdh_failed(enum status_failreason fail,
const char *fmt, ...)
{
fatal("hsm failure: %s", fmt);
}
/*~ This signals to the mainloop that some part wants to cleanly exit now. */
void lightningd_exit(struct lightningd *ld, int exit_code)
{
ld->exit_code = tal(ld, int);
*ld->exit_code = exit_code;
log_debug(ld->log, "io_break: %s", __func__);
io_break(ld);
}
int main(int argc, char *argv[])
{
struct lightningd *ld;
u32 min_blockheight, max_blockheight;
int connectd_gossipd_fd;
int stop_fd;
struct timers *timers;
const char *stop_response;
struct htlc_in_map *unconnected_htlcs_in;
struct ext_key *bip32_base;
int sigchld_rfd;
struct io_conn *sigchld_conn;
int exit_code = 0;
char **orig_argv;
bool try_reexec;
/*~ We fork out new processes very very often; every channel gets its
* own process, for example, and we have `hsmd` and `gossipd` and
* the plugins as well.
* Now, we also keep around several file descriptors (`fd`s), including
* file descriptors to communicate with `hsmd` which is a privileged
* process with access to private keys and is therefore very sensitive.
* Thus, we need to close all file descriptors other than what the
* forked-out new process should have ASAP.
*
* We do this by using the `ccan/closefrom` module, which implements
* an emulation for the `closefrom` syscall on BSD and Solaris.
* This emulation tries to use the fastest facility available on the
* system (`close_range` syscall on Linux 5.9+, snooping through
* `/proc/$PID/fd` on many OSs (but requires procps to be mounted),
* the actual `closefrom` call if available, etc.).
* As a fallback if none of those are available on the system, however,
* it just iterates over the theoretical range of possible file
* descriptors.
*
* On some systems, that theoretical range can be very high, up to
* `INT_MAX` in the worst case.
* If the `closefrom` emulation has to fall back to this loop, it
* can be very slow; fortunately, the emulation will also inform
* us of that via the `closefrom_may_be_slow` function, and also has
* `closefrom_limit` to limit the number of allowed file descriptors
* *IF AND ONLY IF* `closefrom_may_be_slow()` is true.
*
* On systems with a fast `closefrom` then `closefrom_limit` does
* nothing.
*
* Previously we always imposed a limit of 1024 file descriptors
* (because we used to always iterate up to limit instead of using
* some OS facility, because those were non-portable and needed
* code for each OS), until @whitslack went and made >1000 channels
* and hit the 1024 limit.
*/
closefrom_limit(4096);
/*~ What happens in strange locales should stay there. */
setup_locale();
/*~ This sets up SIGCHLD to make sigchld_rfd readable. */
sigchld_rfd = setup_sig_handlers();
/*~ This checks that the system-installed libraries (usually
* dynamically linked) actually are compatible with the ones we
* compiled with.
*
* The header itself is auto-generated every time the version of the
* installed libraries changes, as we had an sqlite3 version update
* which broke people, and "make" didn't think there was any work to
* do, so rebuilding didn't fix it. */
check_linked_library_versions();
/*~ Every daemon calls this in some form: the hooks are for dumping
* backtraces when we crash (if supported on this platform). */
daemon_setup(argv[0], log_backtrace_print, log_backtrace_exit);
/*~ There's always a battle between what a constructor like this
* should do, and what should be added later by the caller. In
* general, because we use valgrind heavily for testing, we prefer not
* to initialize unused fields which we expect the caller to set:
* valgrind will warn us if we make decisions based on uninitialized
* variables. */
ld = new_lightningd(NULL);
ld->state = LD_STATE_RUNNING;
/*~ We store an copy of our arguments before parsing mangles them, so
* we can re-exec if versions of subdaemons change. Note the use of
* notleak() since our leak-detector can't find orig_argv on the
* stack. */
orig_argv = notleak(tal_arr(ld, char *, argc + 1));
for (size_t i = 1; i < argc; i++)
orig_argv[i] = tal_strdup(orig_argv, argv[i]);
/*~ Turn argv[0] into an absolute path (if not already) */
orig_argv[0] = path_join(orig_argv, take(path_cwd(NULL)), argv[0]);
orig_argv[argc] = NULL;
/* Figure out where our daemons are first. */
ld->daemon_dir = find_daemon_dir(ld, argv[0]);
if (!ld->daemon_dir)
errx(EXITCODE_SUBDAEMON_FAIL, "Could not find daemons");
/* Set up the feature bits for what we support */
ld->our_features = default_features(ld);
/*~ Handle early options; this moves us into --lightning-dir.
* Plugins may add new options, which is why we are splitting
* between early args (including --plugin registration) and
* non-early opts. This also forks if they say --daemon. */
handle_early_opts(ld, argc, argv);
/*~ Set the default portnum according to the used network
* similarly to what Bitcoin Core does to ports by default. */
ld->portnum = chainparams_get_ln_port(chainparams);
/*~ Initialize all the plugins we just registered, so they can
* do their thing and tell us about themselves (including
* options registration). */
plugins_init(ld->plugins);
/*~ If the plugis are misconfigured we don't want to proceed. A
* misconfiguration could for example be a plugin marked as important
* not working correctly or a plugin squatting something an important
* plugin needs to register, such as a method or CLI option. If we are
* going to shut down immediately again, we shouldn't spend too much
* effort in starting up.
*/
if (ld->exit_code)
fatal("Could not initialize the plugins, see above for details.");
/*~ Handle options and config. */
handle_opts(ld, argc, argv);
/*~ Now create the PID file: this errors out if there's already a
* daemon running, so we call before doing almost anything else. */
pidfile_create(ld);
/*~ Make sure we can reach the subdaemons, and versions match.
* This can be turned off in DEVELOPER builds with --dev-skip-version-checks,
* but the `dev_no_version_checks` field of `ld` doesn't even exist
* if DEVELOPER isn't defined, so we use IFDEV(devoption,non-devoption):
*/
if (IFDEV(!ld->dev_no_version_checks, 1))
test_subdaemons(ld);
/*~ Set up the HSM daemon, which knows our node secret key, so tells
* us who we are.
*
* HSM stands for Hardware Security Module, which is the industry
* standard of key storage; ours is in software for now, so the name
* doesn't really make sense, but we can't call it the Badly-named
* Daemon Software Module. */
bip32_base = hsm_init(ld);
/*~ Our "wallet" code really wraps the db, which is more than a simple
* bitcoin wallet (though it's that too). It also stores channel
* states, invoices, payments, blocks and bitcoin transactions. */
ld->wallet = wallet_new(ld, ld->timers, bip32_base);
/*~ We keep a filter of scriptpubkeys we're interested in. */
ld->owned_txfilter = txfilter_new(ld);
/*~ This is the ccan/io central poll override from above. */
io_poll_override(io_poll_lightningd);
/*~ If hsm_secret is encrypted, we don't need its encryption key
* anymore. Note that sodium_munlock() also zeroes the memory.*/
if (ld->config.keypass)
discard_key(take(ld->config.keypass));
/*~ Our default color and alias are derived from our node id, so we
* can only set those now (if not set by config options). */
setup_color_and_alias(ld);
/*~ Set up connect daemon: this manages receiving and making
* TCP connections. It needs to talk to the gossip daemon
* which knows (via node_announcement messages) the public
* addresses of nodes, so connectd_init hands it one end of a
* socket pair, and gives us the other */
connectd_gossipd_fd = connectd_init(ld);
/*~ We do every database operation within a transaction; usually this
* is covered by the infrastructure (eg. opening a transaction before
* handling a message or expiring a timer), but for startup we do this
* explicitly. */
db_begin_transaction(ld->wallet->db);
/*~ Our default names, eg. for the database file, are not dependent on
* the network. Instead, the db knows what chain it belongs to, and we
* simple barf here if it's wrong.
*
* We also check that our node_id is what we expect: otherwise a change
* in hsm_secret will have strange consequences! */
if (!wallet_sanity_check(ld->wallet))
errx(EXITCODE_WALLET_DB_MISMATCH, "Wallet sanity check failed.");
/*~ Initialize the transaction filter with our pubkeys. */
init_txfilter(ld->wallet, ld->owned_txfilter);
/*~ Get the blockheight we are currently at, UINT32_MAX is used to signal
* an uninitialized wallet and that we should start off of bitcoind's
* current height */
wallet_blocks_heights(ld->wallet, UINT32_MAX,
&min_blockheight, &max_blockheight);
/*~ If we were asked to rescan from an absolute height (--rescan < 0)
* then just go there. Otherwise compute the diff to our current height,
* lowerbounded by 0. */
if (ld->config.rescan < 0)
max_blockheight = -ld->config.rescan;
else if (max_blockheight < (u32)ld->config.rescan)
max_blockheight = 0;
else if (max_blockheight != UINT32_MAX)
max_blockheight -= ld->config.rescan;
/*~ That's all of the wallet db operations for now. */
db_commit_transaction(ld->wallet->db);
/*~ Initialize block topology. This does its own io_loop to
* talk to bitcoind, so does its own db transactions. */
setup_topology(ld->topology, min_blockheight, max_blockheight);
db_begin_transaction(ld->wallet->db);
/*~ Pull peers, channels and HTLCs from db. Needs to happen after the
* topology is initialized since some decisions rely on being able to
* know the blockheight. */
unconnected_htlcs_in = load_channels_from_wallet(ld);
db_commit_transaction(ld->wallet->db);
/*~ The gossip daemon looks after the routing gossip;
* channel_announcement, channel_update, node_announcement and gossip
* queries. It also hands us the latest channel_updates for our
* channels. */
gossip_init(ld, connectd_gossipd_fd);
/*~ Create RPC socket: now lightning-cli can send us JSON RPC commands
* over a UNIX domain socket specified by `ld->rpc_filename`. */
jsonrpc_listen(ld->jsonrpc, ld);
/*~ Now that the rpc path exists, we can start the plugins and they
* can start talking to us. */
plugins_config(ld->plugins);
/*~ Process any HTLCs we were in the middle of when we exited, now
* that plugins (who might want to know via htlc_accepted hook) are
* active. These will immediately fail, since no peers are connected,
* however partial payments may still be absorbed into htlc_set. */
db_begin_transaction(ld->wallet->db);
htlcs_resubmit(ld, unconnected_htlcs_in);
db_commit_transaction(ld->wallet->db);
/*~ Activate connect daemon. Needs to be after the initialization of
* chaintopology, otherwise peers may connect and ask for
* uninitialized data. */
connectd_activate(ld);
/*~ "onchaind" is a dumb daemon which tries to get our funds back: it
* doesn't handle reorganizations, but it's idempotent, so we can
* simply just restart it if the chain moves. Similarly, we replay it
* chain events from the database on restart, beginning with the
* "funding transaction spent" event which creates it. */
onchaind_replay_channels(ld);
/*~ Now handle sigchld, so we can clean up appropriately. */
sigchld_conn = notleak(io_new_conn(ld, sigchld_rfd, sigchld_rfd_in, ld));
/*~ Mark ourselves live.
*
* Note the use of type_to_string() here: it's a typesafe formatter,
* often handed 'tmpctx' like here to allocate a throwaway string for
* formatting. json_escape() avoids printing weird characters in our
* log. And tal_hex() is a helper from utils which returns a hex string;
* it's assumed that the argument was allocated with tal or tal_arr
* so it can use tal_bytelen() to get the length. */
log_info(ld->log, "--------------------------------------------------");
log_info(ld->log, "Server started with public key %s, alias %s (color #%s) and lightningd %s",
type_to_string(tmpctx, struct node_id, &ld->id),
json_escape(tmpctx, (const char *)ld->alias)->s,
tal_hex(tmpctx, ld->rgb), version());
/*~ If `closefrom_may_be_slow`, we limit ourselves to 4096 file
* descriptors; tell the user about it as that limits the number
* of channels they can have.
* We do not really expect most users to ever reach that many,
* but: https://github.com/ElementsProject/lightning/issues/4868
*/
if (closefrom_may_be_slow())
log_info(ld->log,
"We have self-limited number of open file "
"descriptors to 4096, but that will result in a "
"'Too many open files' error if you ever reach "
">4000 channels. Please upgrade your OS kernel "
"(Linux 5.9+, FreeBSD 8.0+), or mount proc or "
"/dev/fd (if running in chroot) if you are "
"approaching that many channels.");
/*~ If we have channels closing, make sure we re-xmit the last
* transaction, in case bitcoind lost it. */
db_begin_transaction(ld->wallet->db);
resend_closing_transactions(ld);
db_commit_transaction(ld->wallet->db);
/*~ This is where we ask connectd to reconnect to any peers who have
* live channels with us, and makes sure we're watching the funding
* tx. */
setup_peers(ld);
/*~ Now that all the notifications for transactions are in place, we
* can start the poll loop which queries bitcoind for new blocks. */
begin_topology(ld->topology);
/*~ To handle --daemon, we fork the daemon early (otherwise we hit
* issues with our pid changing), but keep the parent around until
* we've completed most initialization: that way we'll exit with an
* error rather than silently exiting 0, then realizing we can't start
* and forcing the confused user to read the logs.
*
* But we're all initialized, so detach and have parent exit now. */
if (ld->daemon_parent_fd != -1)
complete_daemonize(ld);
/*~ Setting this (global) activates the crash log: we don't usually need
* a backtrace if we fail during startup. */
crashlog = ld->log;
/*~ This sets up the ecdh() function in ecdh_hsmd to talk to hsmd */
ecdh_hsmd_setup(ld->hsm_fd, hsm_ecdh_failed);
/*~ The root of every backtrace (almost). This is our main event
* loop. */
void *io_loop_ret = io_loop_with_timers(ld);
/*~ io_loop_with_timers will only exit if we call io_break.
* At this point in code, we should use io_break(ld) to
* shut down.
*/
assert(io_loop_ret == ld);
log_debug(ld->log, "io_loop_with_timers: %s", __func__);
/* Stop *new* JSON RPC requests. */
jsonrpc_stop_listening(ld->jsonrpc);
/* Give permission for things to get destroyed without getting upset. */
ld->state = LD_STATE_SHUTDOWN;
stop_fd = -1;
stop_response = NULL;
/* Were we exited via `lightningd_exit`? */
if (ld->exit_code) {
exit_code = *ld->exit_code;
} else if (ld->stop_conn) {
/* Keep this fd around, to write final response at the end. */
stop_fd = io_conn_fd(ld->stop_conn);
io_close_taken_fd(ld->stop_conn);
stop_response = tal_steal(NULL, ld->stop_response);
}
/* Stop topology callbacks. */
stop_topology(ld->topology);
/* We're not going to collect our children. */
remove_sigchild_handler(sigchld_conn);
/* Get rid of per-channel subdaemons. */
subd_shutdown_nonglobals(ld);
/* Tell plugins we're shutting down, use force if necessary. */
shutdown_plugins(ld);
/* Now kill any remaining connections */
jsonrpc_stop_all(ld);
/* Get rid of major subdaemons. */
shutdown_global_subdaemons(ld);
/* Clean up internal peer/channel/htlc structures. */
free_all_channels(ld);
/* Now close database */
ld->wallet->db = tal_free(ld->wallet->db);
/* Clean our our HTLC maps, since they use malloc. */
htlc_in_map_clear(&ld->htlcs_in);
htlc_out_map_clear(&ld->htlcs_out);
remove(ld->pidfile);
/* FIXME: pay can have children off tmpctx which unlink from
* ld->payments, so clean that up. */
clean_tmpctx();
/* Gather these before we free ld! */
try_reexec = ld->try_reexec;
if (try_reexec)
tal_steal(NULL, orig_argv);
/* Free this last: other things may clean up timers. */
timers = tal_steal(NULL, ld->timers);
tal_free(ld);
timers_cleanup(timers);
tal_free(timers);
opt_free_table();
daemon_shutdown();
/* Finally, send response to shutdown command if appropriate. */
if (stop_fd >= 0) {
write_all(stop_fd, stop_response, strlen(stop_response));
close(stop_fd);
tal_free(stop_response);
}
/* Were we supposed to restart ourselves? */
if (try_reexec) {
/* Give a reasonable chance for the install to finish. */
sleep(5);
/* Close all filedescriptors except stdin/stdout/stderr */
closefrom(STDERR_FILENO + 1);
execv(orig_argv[0], orig_argv);
err(1, "Failed to re-exec ourselves after version change");
}
/*~ Farewell. Next stop: hsmd/hsmd.c. */
return exit_code;
}