gossipd: check that gossmap code sees updates from gossip_store writes.

After analyzing various weird cases where we ended up with duplicate
gossip_store entries, it could be explained by us not fully processing
the gossip store.

It's not clear that my assumptions that we would always see our own writes
are true: technically this may require an fsync().  So we now add the
check, and do an fsync and try again.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Changelog-Fixed: gossipd: more sanity checks that we are correctly updating the gossip_store file.
This commit is contained in:
Rusty Russell 2025-02-10 16:37:58 +10:30 committed by Alex Myers
parent 8156c83e11
commit 2b4b1479ed
5 changed files with 80 additions and 1 deletions

View file

@ -1851,3 +1851,17 @@ int gossmap_fd(const struct gossmap *map)
{
return map->fd;
}
const u8 *gossmap_fetch_tail(const tal_t *ctx, const struct gossmap *map)
{
size_t len;
u8 *p;
/* Shouldn't happen... */
if (map->map_end > map->map_size)
return NULL;
len = map->map_size - map->map_end;
p = tal_arr(ctx, u8, len);
map_copy(map, map->map_size, p, len);
return p;
}

View file

@ -304,4 +304,7 @@ u64 gossmap_lengths(const struct gossmap *map, u64 *total);
/* Debugging: connectd wants to enumerate fds */
int gossmap_fd(const struct gossmap *map);
/* Fetch unprocessed part of gossmap */
const u8 *gossmap_fetch_tail(const tal_t *ctx, const struct gossmap *map);
#endif /* LIGHTNING_COMMON_GOSSMAP_H */

View file

@ -408,7 +408,7 @@ struct gossip_store *gossip_store_new(const tal_t *ctx,
return gs;
}
static void gossip_store_fsync(const struct gossip_store *gs)
void gossip_store_fsync(const struct gossip_store *gs)
{
if (fsync(gs->fd) != 0)
status_failed(STATUS_FAIL_INTERNAL_ERROR,

View file

@ -110,5 +110,6 @@ void gossip_store_set_timestamp(struct gossip_store *gs, u64 offset, u32 timesta
* For debugging.
*/
u64 gossip_store_len_written(const struct gossip_store *gs);
void gossip_store_fsync(const struct gossip_store *gs);
#endif /* LIGHTNING_GOSSIPD_GOSSIP_STORE_H */

View file

@ -1365,9 +1365,70 @@ void gossmap_manage_channel_spent(struct gossmap_manage *gm,
}
}
/* Fetch the part of the gossmap we didn't process via read() */
static const u8 *fetch_tail_fd(const tal_t *ctx,
int gossmap_fd,
u64 map_used, u64 map_size)
{
size_t len;
ssize_t r;
u8 *p;
/* Shouldn't happen... */
if (map_used > map_size)
return NULL;
len = map_size - map_used;
p = tal_arrz(ctx, u8, len);
r = pread(gossmap_fd, p, len, map_used);
if (r != len)
status_broken("Partial read on gossmap EOF (%zi vs %zu)",
r, len);
return p;
}
struct gossmap *gossmap_manage_get_gossmap(struct gossmap_manage *gm)
{
u64 map_used, map_size, written_len;
gossmap_refresh(gm->raw_gossmap);
/* Sanity check that we see everything we wrote. */
map_used = gossmap_lengths(gm->raw_gossmap, &map_size);
written_len = gossip_store_len_written(gm->gs);
if (map_size != written_len) {
status_broken("gossmap size %"PRIu64" != written size %"PRIu64,
map_size, written_len);
/* Push harder! */
gossip_store_fsync(gm->gs);
gossmap_refresh(gm->raw_gossmap);
/* Sanity check that we see everything we wrote. */
map_used = gossmap_lengths(gm->raw_gossmap, &map_size);
written_len = gossip_store_len_written(gm->gs);
if (map_used != written_len || map_size != map_used)
status_failed(STATUS_FAIL_INTERNAL_ERROR,
"gossmap read inconsistent even after sync"
" used=%"PRIu64" seen=%"PRIu64" written=%"PRIu64,
map_used, map_size, written_len);
} else if (map_size != map_used) {
const u8 *remainder_fd, *remainder_mmap;
/* Sigh. Did gossmap see something different (via mmap)
* from what we see via read? It's possible it's caught up
* now, but just in case, log BOTH */
remainder_mmap = gossmap_fetch_tail(tmpctx, gm->raw_gossmap);
remainder_fd = fetch_tail_fd(tmpctx,
gossmap_fd(gm->raw_gossmap),
map_used, map_size);
status_failed(STATUS_FAIL_INTERNAL_ERROR,
"Gossmap failed to process entire gossip_store: "
"at %"PRIu64" of %"PRIu64" remaining_mmap=%s remaining_fd=%s",
map_used, map_size,
tal_hex(tmpctx, remainder_mmap),
tal_hex(tmpctx, remainder_fd));
}
return gm->raw_gossmap;
}