From 8928f0b5f95abf7cbc752e21f0b1c9571b95d030 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 20 Jun 2019 12:27:52 +0930 Subject: [PATCH] gossipd: remove gossip entirely if we hit a problem on load. The crashes in #2750 are mostly caused by us trying to partially truncate the store. The simplest fix for release is to discard the whole thing if we detect a problem. This is a workaround: it'd be far nicer to try to recover. Fixes: #2750 Signed-off-by: Rusty Russell --- gossipd/gossip_store.c | 65 ++++++++++----------------------- gossipd/routing.c | 83 ++++++++++++++++++++++++++---------------- gossipd/routing.h | 10 ++--- tests/test_gossip.py | 4 +- 4 files changed, 76 insertions(+), 86 deletions(-) diff --git a/gossipd/gossip_store.c b/gossipd/gossip_store.c index 507d8f3ae..d259dc267 100644 --- a/gossipd/gossip_store.c +++ b/gossipd/gossip_store.c @@ -525,44 +525,6 @@ int gossip_store_readonly_fd(struct gossip_store *gs) return fd; } -/* If we ever truncated, we might have a dangling entries. */ -static void cleanup_truncated_store(struct routing_state *rstate, - struct gossip_store *gs, - u32 chan_ann_off) -{ - size_t num; - u32 index; - - /* channel_announce with no channel_amount. */ - if (chan_ann_off) { - status_unusual("Deleting un-amounted channel_announcement @%u", - chan_ann_off); - delete_by_index(gs, chan_ann_off, WIRE_CHANNEL_ANNOUNCEMENT); - } - - num = 0; - while ((index = remove_unfinalized_node_announce(rstate)) != 0) { - delete_by_index(gs, index, WIRE_NODE_ANNOUNCEMENT); - num++; - } - if (num) - status_unusual("Deleted %zu unfinalized node_announcements", - num); - - num = 0; - while ((index = remove_unupdated_channel_announce(rstate)) != 0) { - u32 next; - - /* Delete announcement and channel amount, too */ - next = delete_by_index(gs, index, WIRE_CHANNEL_ANNOUNCEMENT); - delete_by_index(gs, next, WIRE_GOSSIP_STORE_CHANNEL_AMOUNT); - num++; - } - if (num) - status_unusual("Deleted %zu unupdated channel_announcements", - num); -} - bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs) { struct gossip_hdr hdr; @@ -672,25 +634,36 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs) clean_tmpctx(); } + if (chan_ann) { + status_unusual("gossip_store: dangling channel_announcement"); + goto truncate_nomsg; + } + + bad = unfinalized_entries(tmpctx, rstate); + if (bad) { + status_unusual("gossip_store: %s", bad); + goto truncate_nomsg; + } + /* If last timestamp is within 24 hours, say we're OK. */ contents_ok = (last_timestamp >= time_now().ts.tv_sec - 24*3600); goto out; truncate: - status_unusual("gossip_store: %s (%s) truncating to %"PRIu64, - bad, tal_hex(msg, msg), gs->len); + status_unusual("gossip_store: %s (%s) truncating", + bad, tal_hex(msg, msg)); + truncate_nomsg: - /* FIXME: We would like to truncate to known_good, except we would - * miss channel_delete msgs. If we put block numbers into the store - * as we process them, we can know how far we need to roll back if we - * truncate the store */ - if (ftruncate(gs->fd, gs->len) != 0) + /* FIXME: Debug partial truncate case. */ + if (ftruncate(gs->fd, 1) != 0) status_failed(STATUS_FAIL_INTERNAL_ERROR, "Truncating store: %s", strerror(errno)); + remove_all_gossip(rstate); + gs->count = gs->deleted = 0; + gs->len = 1; contents_ok = false; out: gs->writable = true; - cleanup_truncated_store(rstate, gs, chan_ann ? chan_ann_off : 0); status_trace("total store load time: %"PRIu64" msec", time_to_msec(time_between(time_now(), start))); status_trace("gossip_store: Read %zu/%zu/%zu/%zu cannounce/cupdate/nannounce/cdelete from store (%zu deleted) in %"PRIu64" bytes", diff --git a/gossipd/routing.c b/gossipd/routing.c index 4d433acd5..92c3f9b15 100644 --- a/gossipd/routing.c +++ b/gossipd/routing.c @@ -2583,42 +2583,63 @@ struct timeabs gossip_time_now(const struct routing_state *rstate) return time_now(); } -/* gossip_store wants to delete any dangling node_announcement msgs */ -u32 remove_unfinalized_node_announce(struct routing_state *rstate) -{ - /* We're only interested in node_announcement we caught. */ - for (;;) { - struct pending_node_announce *pna; - struct pending_node_map_iter it; - - pna = pending_node_map_first(rstate->pending_node_map, &it); - if (!pna) - return 0; - - /* This will be deleted by the associated unupdated_channel; just - * remove from map for now. */ - pending_node_map_del(rstate->pending_node_map, pna); - if (!pna->node_announcement) - continue; - - assert(pna->index); - return pna->index; - } -} - -/* gossip_store wants to delete any dangling channel_announcement msgs */ -u32 remove_unupdated_channel_announce(struct routing_state *rstate) +const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate) { struct unupdated_channel *uc; u64 index; + struct pending_node_announce *pna; + struct pending_node_map_iter it; uc = uintmap_first(&rstate->unupdated_chanmap, &index); - if (!uc) - return 0; + if (uc) + return tal_fmt(ctx, "Unupdated channel_announcement at %u", + uc->index); - assert(uc->index); - index = uc->index; + pna = pending_node_map_first(rstate->pending_node_map, &it); + if (pna) + return tal_fmt(ctx, "Waiting node_announcement at %u", + pna->index); - tal_free(uc); - return index; + return NULL; +} + +/* Gossip store was corrupt, forget anything we loaded. */ +void remove_all_gossip(struct routing_state *rstate) +{ + struct node *n; + struct node_map_iter nit; + struct chan *c; + struct unupdated_channel *uc; + u64 index; + struct pending_cannouncement *pca; + struct pending_cannouncement_map_iter pit; + struct pending_node_map_iter pnait; + + /* We don't want them to try to delete from store, so do this + * manually. */ + while ((n = node_map_first(rstate->nodes, &nit)) != NULL) { + tal_del_destructor2(n, destroy_node, rstate); + if (node_uses_chan_map(n)) + chan_map_clear(&n->chans.map); + node_map_del(rstate->nodes, n); + tal_free(n); + } + + /* Now free all the channels. */ + while ((c = uintmap_first(&rstate->chanmap, &index)) != NULL) { + uintmap_del(&rstate->chanmap, index); + + /* Remove from local_disabled_map if it's there. */ + chan_map_del(&rstate->local_disabled_map, c); + tal_free(c); + } + + while ((uc = uintmap_first(&rstate->unupdated_chanmap, &index)) != NULL) + tal_free(uc); + + while ((pca = pending_cannouncement_map_first(&rstate->pending_cannouncements, &pit)) != NULL) + tal_free(pca); + + /* Freeing unupdated chanmaps should empty this */ + assert(pending_node_map_first(rstate->pending_node_map, &pnait) == NULL); } diff --git a/gossipd/routing.h b/gossipd/routing.h index 392ac4864..b886290f0 100644 --- a/gossipd/routing.h +++ b/gossipd/routing.h @@ -437,12 +437,8 @@ struct wireaddr *read_addresses(const tal_t *ctx, const u8 *ser); void remove_channel_from_store(struct routing_state *rstate, struct chan *chan); -/* gossip_store wants to delete any dangling entries immediately after - * load; return 0 if no more, otherwise index into store. - * - * Must call remove_unfinalized_node_announce first, because removing - * unupdated channels may delete associatd node_announcements. */ -u32 remove_unfinalized_node_announce(struct routing_state *rstate); -u32 remove_unupdated_channel_announce(struct routing_state *rstate); +/* Returns an error string if there are unfinalized entries after load */ +const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate); +void remove_all_gossip(struct routing_state *rstate); #endif /* LIGHTNING_GOSSIPD_ROUTING_H */ diff --git a/tests/test_gossip.py b/tests/test_gossip.py index 6c9629869..54ef6a11b 100644 --- a/tests/test_gossip.py +++ b/tests/test_gossip.py @@ -945,8 +945,8 @@ def test_gossip_store_load_amount_truncated(node_factory): l1.start() # May preceed the Started msg waited for in 'start'. - wait_for(lambda: l1.daemon.is_in_log(r'Deleting un-amounted channel_announcement @1')) - wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(1 deleted\) in 445 bytes')) + wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: dangling channel_announcement')) + wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(0 deleted\) in 1 bytes')) assert not l1.daemon.is_in_log('gossip_store.*truncating') # Extra sanity check if we can.