gossipd: remove gossip entirely if we hit a problem on load.

The crashes in #2750 are mostly caused by us trying to partially truncate
the store.  The simplest fix for release is to discard the whole thing if
we detect a problem.

This is a workaround: it'd be far nicer to try to recover.

Fixes: #2750
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
Rusty Russell 2019-06-20 12:27:52 +09:30
parent 8ce3b86aa5
commit 8928f0b5f9
4 changed files with 76 additions and 86 deletions

View File

@ -525,44 +525,6 @@ int gossip_store_readonly_fd(struct gossip_store *gs)
return fd;
}
/* If we ever truncated, we might have dangling entries.
 *
 * Deletes, in this order:
 *  1. the channel_announcement at chan_ann_off (skipped when it is 0),
 *  2. every entry handed back by remove_unfinalized_node_announce(),
 *  3. every entry handed back by remove_unupdated_channel_announce(),
 *     together with the record that follows it in the store.
 *
 * Each non-empty batch is reported once through status_unusual(). */
static void cleanup_truncated_store(struct routing_state *rstate,
				    struct gossip_store *gs,
				    u32 chan_ann_off)
{
	size_t n_node_anns = 0, n_chan_anns = 0;
	u32 off;

	/* channel_announce with no channel_amount. */
	if (chan_ann_off != 0) {
		status_unusual("Deleting un-amounted channel_announcement @%u",
			       chan_ann_off);
		delete_by_index(gs, chan_ann_off, WIRE_CHANNEL_ANNOUNCEMENT);
	}

	/* Drain node_announcements before touching unupdated channels;
	 * a return of 0 means there are none left. */
	for (;;) {
		off = remove_unfinalized_node_announce(rstate);
		if (off == 0)
			break;
		delete_by_index(gs, off, WIRE_NODE_ANNOUNCEMENT);
		n_node_anns++;
	}
	if (n_node_anns != 0)
		status_unusual("Deleted %zu unfinalized node_announcements",
			       n_node_anns);

	for (;;) {
		u32 amount_off;

		off = remove_unupdated_channel_announce(rstate);
		if (off == 0)
			break;

		/* Delete announcement and channel amount, too: the value
		 * returned by the first delete is used as the index of the
		 * channel amount record. */
		amount_off = delete_by_index(gs, off, WIRE_CHANNEL_ANNOUNCEMENT);
		delete_by_index(gs, amount_off, WIRE_GOSSIP_STORE_CHANNEL_AMOUNT);
		n_chan_anns++;
	}
	if (n_chan_anns != 0)
		status_unusual("Deleted %zu unupdated channel_announcements",
			       n_chan_anns);
}
bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
{
struct gossip_hdr hdr;
@ -672,25 +634,36 @@ bool gossip_store_load(struct routing_state *rstate, struct gossip_store *gs)
clean_tmpctx();
}
if (chan_ann) {
status_unusual("gossip_store: dangling channel_announcement");
goto truncate_nomsg;
}
bad = unfinalized_entries(tmpctx, rstate);
if (bad) {
status_unusual("gossip_store: %s", bad);
goto truncate_nomsg;
}
/* If last timestamp is within 24 hours, say we're OK. */
contents_ok = (last_timestamp >= time_now().ts.tv_sec - 24*3600);
goto out;
truncate:
status_unusual("gossip_store: %s (%s) truncating to %"PRIu64,
bad, tal_hex(msg, msg), gs->len);
status_unusual("gossip_store: %s (%s) truncating",
bad, tal_hex(msg, msg));
truncate_nomsg:
/* FIXME: We would like to truncate to known_good, except we would
* miss channel_delete msgs. If we put block numbers into the store
* as we process them, we can know how far we need to roll back if we
* truncate the store */
if (ftruncate(gs->fd, gs->len) != 0)
/* FIXME: Debug partial truncate case. */
if (ftruncate(gs->fd, 1) != 0)
status_failed(STATUS_FAIL_INTERNAL_ERROR,
"Truncating store: %s", strerror(errno));
remove_all_gossip(rstate);
gs->count = gs->deleted = 0;
gs->len = 1;
contents_ok = false;
out:
gs->writable = true;
cleanup_truncated_store(rstate, gs, chan_ann ? chan_ann_off : 0);
status_trace("total store load time: %"PRIu64" msec",
time_to_msec(time_between(time_now(), start)));
status_trace("gossip_store: Read %zu/%zu/%zu/%zu cannounce/cupdate/nannounce/cdelete from store (%zu deleted) in %"PRIu64" bytes",

View File

@ -2583,42 +2583,63 @@ struct timeabs gossip_time_now(const struct routing_state *rstate)
return time_now();
}
/* gossip_store wants to delete any dangling node_announcement msgs */
u32 remove_unfinalized_node_announce(struct routing_state *rstate)
{
/* We're only interested in node_announcement we caught. */
for (;;) {
struct pending_node_announce *pna;
struct pending_node_map_iter it;
pna = pending_node_map_first(rstate->pending_node_map, &it);
if (!pna)
return 0;
/* This will be deleted by the associated unupdated_channel; just
* remove from map for now. */
pending_node_map_del(rstate->pending_node_map, pna);
if (!pna->node_announcement)
continue;
assert(pna->index);
return pna->index;
}
}
/* gossip_store wants to delete any dangling channel_announcement msgs */
u32 remove_unupdated_channel_announce(struct routing_state *rstate)
const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate)
{
struct unupdated_channel *uc;
u64 index;
struct pending_node_announce *pna;
struct pending_node_map_iter it;
uc = uintmap_first(&rstate->unupdated_chanmap, &index);
if (!uc)
return 0;
if (uc)
return tal_fmt(ctx, "Unupdated channel_announcement at %u",
uc->index);
assert(uc->index);
index = uc->index;
pna = pending_node_map_first(rstate->pending_node_map, &it);
if (pna)
return tal_fmt(ctx, "Waiting node_announcement at %u",
pna->index);
tal_free(uc);
return index;
return NULL;
}
/* Gossip store was corrupt, forget anything we loaded. */
void remove_all_gossip(struct routing_state *rstate)
{
struct node *n;
struct node_map_iter nit;
struct chan *c;
struct unupdated_channel *uc;
u64 index;
struct pending_cannouncement *pca;
struct pending_cannouncement_map_iter pit;
struct pending_node_map_iter pnait;
/* We don't want them to try to delete from store, so do this
* manually. */
while ((n = node_map_first(rstate->nodes, &nit)) != NULL) {
tal_del_destructor2(n, destroy_node, rstate);
if (node_uses_chan_map(n))
chan_map_clear(&n->chans.map);
node_map_del(rstate->nodes, n);
tal_free(n);
}
/* Now free all the channels. */
while ((c = uintmap_first(&rstate->chanmap, &index)) != NULL) {
uintmap_del(&rstate->chanmap, index);
/* Remove from local_disabled_map if it's there. */
chan_map_del(&rstate->local_disabled_map, c);
tal_free(c);
}
while ((uc = uintmap_first(&rstate->unupdated_chanmap, &index)) != NULL)
tal_free(uc);
while ((pca = pending_cannouncement_map_first(&rstate->pending_cannouncements, &pit)) != NULL)
tal_free(pca);
/* Freeing unupdated chanmaps should empty this */
assert(pending_node_map_first(rstate->pending_node_map, &pnait) == NULL);
}

View File

@ -437,12 +437,8 @@ struct wireaddr *read_addresses(const tal_t *ctx, const u8 *ser);
void remove_channel_from_store(struct routing_state *rstate,
struct chan *chan);
/* gossip_store wants to delete any dangling entries immediately after
* load; return 0 if no more, otherwise index into store.
*
* Must call remove_unfinalized_node_announce first, because removing
* unupdated channels may delete associated node_announcements. */
u32 remove_unfinalized_node_announce(struct routing_state *rstate);
u32 remove_unupdated_channel_announce(struct routing_state *rstate);
/* Returns an error string if there are unfinalized entries after load */
const char *unfinalized_entries(const tal_t *ctx, struct routing_state *rstate);
void remove_all_gossip(struct routing_state *rstate);
#endif /* LIGHTNING_GOSSIPD_ROUTING_H */

View File

@ -945,8 +945,8 @@ def test_gossip_store_load_amount_truncated(node_factory):
l1.start()
# May precede the Started msg waited for in 'start'.
wait_for(lambda: l1.daemon.is_in_log(r'Deleting un-amounted channel_announcement @1'))
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(1 deleted\) in 445 bytes'))
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: dangling channel_announcement'))
wait_for(lambda: l1.daemon.is_in_log(r'gossip_store: Read 0/0/0/0 cannounce/cupdate/nannounce/cdelete from store \(0 deleted\) in 1 bytes'))
assert not l1.daemon.is_in_log('gossip_store.*truncating')
# Extra sanity check if we can.