mirror of
https://github.com/ElementsProject/lightning.git
synced 2025-01-18 21:35:11 +01:00
gossipd: remove gossip entirely if we hit a problem on load.
The crashes in #2750 are mostly caused by us trying to partially truncate the store. The simplest fix for release is to discard the whole thing if we detect a problem. This is a workaround: it'd be far nicer to try to recover. Fixes: #2750 Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
parent
8ce3b86aa5
commit
8928f0b5f9
@ -525,44 +525,6 @@ int gossip_store_readonly_fd(struct gossip_store *gs)
|
||||
return fd;
|
||||
}
|
||||
|
||||
/* If we ever truncated, we might have a dangling entries. */
|
||||
static void cleanup_truncated_store(struct routing_state *rstate,
|
||||
struct gossip_store *gs,
|
||||
u32 chan_ann_off)
|
||||
{
|
||||
size_t num;
|
||||
u32 index;
|
||||
|
||||
/* channel_announce with no channel_amount. */
|
||||
if (chan_ann_off) {
|
||||
status_unusual("Deleting un-amounted channel_announcement @%u",
|
||||
chan_ann_off);
|
||||
delete_by_index(gs, chan_ann_off, WIRE_CHANNEL_ANNOUNCEMENT);
|
||||
}
|
||||
|
||||
num = 0;
|
||||
while ((index = remove_unfinalized_node_announce(rstate)) != 0) {
|
||||
delete_by_index(gs, index, WIRE_NODE_ANNOUNCEMENT);
|
||||
num++;
|
||||
}
|
||||
if (num)
|
||||
status_unusual("Deleted %zu unfinalized node_announcements",
|
||||
num);
|
||||
|
||||
num = 0;
|
||||
while ((index = remove_unupdated_channel_announce(rstate)) != 0) {
|
||||
u32 next;
|
||||
|
||||
/* Delete announcement and channel amount, too */
|
||||
next = delete_by_index(gs, index, WIRE_CHANNEL_ANNOUNCEMENT);
|
||||
delete_by_index(gs, next, WIRE_GOSSIP_STORE_CHANNEL_AMOUNT);
|
||||
num++;
|
||||
}
|
||||
if (num)
|
||||
status_unusual("Deleted %zu unupdated channel_announcements",
|
||||
num);
|
||||
}
|
||||
|
||||
bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
|
||||
{
|
||||
struct gossip_hdr hdr;
|
||||
@ -672,25 +634,36 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
|
||||
clean_tmpctx();
|
||||
}
|
||||
|
||||
if (chan_ann) {
|
||||
status_unusual("gossip_store: dangling channel_announcement");
|
||||
goto truncate_nomsg;
|
||||
}
|
||||
|
||||
bad = unfinalized_entries(tmpctx, rstate);
|
||||
if (bad) {
|
||||
status_unusual("gossip_store: %s", bad);
|
||||
goto truncate_nomsg;
|
||||
}
|
||||
|
||||
/* If last timestamp is within 24 hours, say we're OK. */
|
||||
contents_ok = (last_timestamp >= time_now().ts.tv_sec - 24*3600);
|
||||
goto out;
|
||||
|
||||
truncate:
|
||||
status_unusual("gossip_store: %s (%s) truncating to %"PRIu64,
|
||||
bad, tal_hex(msg, msg), gs->len);
|
||||
status_unusual("gossip_store: %s (%s) truncating",
|
||||
bad, tal_hex(msg, msg));
|
||||
|
||||
truncate_nomsg:
|
||||
/* FIXME: We would like to truncate to known_good, except we would
|
||||
* miss channel_delete msgs. If we put block numbers into the store
|
||||
* as we process them, we can know how far we need to roll back if we
|
||||
* truncate the store */
|
||||
if (ftruncate(gs->fd, gs->len) != 0)
|
||||
/* FIXME: Debug partial truncate case. */
|
||||
if (ftruncate(gs->fd, 1) != 0)
|
||||
status_failed(STATUS_FAIL_INTERNAL_ERROR,
|
||||
"Truncating store: %s", strerror(errno));
|
||||
remove_all_gossip(rstate);
|
||||
gs->count = gs->deleted = 0;
|
||||
gs->len = 1;
|
||||
contents_ok = false;
|
||||
out:
|
||||
gs->writable = true;
|
||||
cleanup_truncated_store(rstate, gs, chan_ann ? chan_ann_off : 0);
|
||||
status_trace("total store load time: %"PRIu64" msec",
|
||||
time_to_msec(time_between(time_now(), start)));
|
||||
status_trace("gossip_store: Read %zu/%zu/%zu/%zu cannounce/cupdate/nannounce/cdelete from store (%zu deleted) in %"PRIu64" bytes",
|
||||
|
@ -2583,42 +2583,63 @@ struct timeabs gossip_time_now(const struct routing_state *rstate)
|
||||
return time_now();
|
||||
}
|
||||
|
||||
/* gossip_store wants to delete any dangling node_announcement msgs */
|
||||
u32 remove_unfinalized_node_announce(struct routing_state *rstate)
|
||||
{
|
||||
/* We're only interested in node_announcement we caught. */
|
||||
for (;;) {
|
||||
struct pending_node_announce *pna;
|
||||
struct pending_node_map_iter it;
|
||||
|
||||
pna = pending_node_map_first(rstate->pending_node_map, &it);
|
||||
if (!pna)
|
||||
return 0;
|
||||
|
||||
/* This will be deleted by the associated unupdated_channel; just
|
||||
* remove from map for now. */
|
||||
pending_node_map_del(rstate->pending_node_map, pna);
|
||||
if (!pna->node_announcement)
|
||||
continue;
|
||||
|
||||
assert(pna->index);
|
||||
return pna->index;
|
||||
}
|
||||
}
|
||||
|
||||
/* gossip_store wants to delete any dangling channel_announcement msgs */
|
||||
u32 remove_unupdated_channel_announce(struct routing_state *rstate)
|
||||
const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate)
|
||||
{
|
||||
struct unupdated_channel *uc;
|
||||
u64 index;
|
||||
struct pending_node_announce *pna;
|
||||
struct pending_node_map_iter it;
|
||||
|
||||
uc = uintmap_first(&rstate->unupdated_chanmap, &index);
|
||||
if (!uc)
|
||||
return 0;
|
||||
if (uc)
|
||||
return tal_fmt(ctx, "Unupdated channel_announcement at %u",
|
||||
uc->index);
|
||||
|
||||
assert(uc->index);
|
||||
index = uc->index;
|
||||
pna = pending_node_map_first(rstate->pending_node_map, &it);
|
||||
if (pna)
|
||||
return tal_fmt(ctx, "Waiting node_announcement at %u",
|
||||
pna->index);
|
||||
|
||||
tal_free(uc);
|
||||
return index;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Gossip store was corrupt, forget anything we loaded. */
|
||||
void remove_all_gossip(struct routing_state *rstate)
|
||||
{
|
||||
struct node *n;
|
||||
struct node_map_iter nit;
|
||||
struct chan *c;
|
||||
struct unupdated_channel *uc;
|
||||
u64 index;
|
||||
struct pending_cannouncement *pca;
|
||||
struct pending_cannouncement_map_iter pit;
|
||||
struct pending_node_map_iter pnait;
|
||||
|
||||
/* We don't want them to try to delete from store, so do this
|
||||
* manually. */
|
||||
while ((n = node_map_first(rstate->nodes, &nit)) != NULL) {
|
||||
tal_del_destructor2(n, destroy_node, rstate);
|
||||
if (node_uses_chan_map(n))
|
||||
chan_map_clear(&n->chans.map);
|
||||
node_map_del(rstate->nodes, n);
|
||||
tal_free(n);
|
||||
}
|
||||
|
||||
/* Now free all the channels. */
|
||||
while ((c = uintmap_first(&rstate->chanmap, &index)) != NULL) {
|
||||
uintmap_del(&rstate->chanmap, index);
|
||||
|
||||
/* Remove from local_disabled_map if it's there. */
|
||||
chan_map_del(&rstate->local_disabled_map, c);
|
||||
tal_free(c);
|
||||
}
|
||||
|
||||
while ((uc = uintmap_first(&rstate->unupdated_chanmap, &index)) != NULL)
|
||||
tal_free(uc);
|
||||
|
||||
while ((pca = pending_cannouncement_map_first(&rstate->pending_cannouncements, &pit)) != NULL)
|
||||
tal_free(pca);
|
||||
|
||||
/* Freeing unupdated chanmaps should empty this */
|
||||
assert(pending_node_map_first(rstate->pending_node_map, &pnait) == NULL);
|
||||
}
|
||||
|
@ -437,12 +437,8 @@ struct wireaddr *read_addresses(const tal_t *ctx, const u8 *ser);
|
||||
void remove_channel_from_store(struct routing_state *rstate,
|
||||
struct chan *chan);
|
||||
|
||||
/* gossip_store wants to delete any dangling entries immediately after
|
||||
* load; return 0 if no more, otherwise index into store.
|
||||
*
|
||||
* Must call remove_unfinalized_node_announce first, because removing
|
||||
* unupdated channels may delete associated node_announcements. */
|
||||
u32 remove_unfinalized_node_announce(struct routing_state *rstate);
|
||||
u32 remove_unupdated_channel_announce(struct routing_state *rstate);
|
||||
/* Returns an error string if there are unfinalized entries after load */
|
||||
const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate);
|
||||
|
||||
void remove_all_gossip(struct routing_state *rstate);
|
||||
#endif /* LIGHTNING_GOSSIPD_ROUTING_H */
|
||||
|
@ -945,8 +945,8 @@ def test_gossip_store_load_amount_truncated(node_factory):
|
||||
|
||||
l1.start()
|
||||
# May precede the Started msg waited for in 'start'.
|
||||
wait_for(lambda: l1.daemon.is_in_log(r'Deleting un-amounted channel_announcement @1'))
|
||||
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(1 deleted\) in 445 bytes'))
|
||||
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: dangling channel_announcement'))
|
||||
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(0 deleted\) in 1 bytes'))
|
||||
assert not l1.daemon.is_in_log('gossip_store.*truncating')
|
||||
|
||||
# Extra sanity check if we can.
|
||||
|
Loading…
Reference in New Issue
Block a user