askrene: give better feedback when we can't find a suitable route.

This turns out to be critical for users: also stops them from
bothering us when their node is offline or has insufficient capacity!

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
Rusty Russell 2024-10-04 09:09:53 +09:30
parent e54c0f8ded
commit 630ec6a566
9 changed files with 404 additions and 12 deletions

View file

@ -1,5 +1,5 @@
PLUGIN_ASKRENE_SRC := plugins/askrene/askrene.c plugins/askrene/layer.c plugins/askrene/reserve.c plugins/askrene/mcf.c plugins/askrene/dijkstra.c plugins/askrene/flow.c plugins/askrene/refine.c
PLUGIN_ASKRENE_HEADER := plugins/askrene/askrene.h plugins/askrene/layer.h plugins/askrene/reserve.h plugins/askrene/mcf.h plugins/askrene/dijkstra.h plugins/askrene/flow.h plugins/askrene/refine.h
PLUGIN_ASKRENE_SRC := plugins/askrene/askrene.c plugins/askrene/layer.c plugins/askrene/reserve.c plugins/askrene/mcf.c plugins/askrene/dijkstra.c plugins/askrene/flow.c plugins/askrene/refine.c plugins/askrene/explain_failure.c
PLUGIN_ASKRENE_HEADER := plugins/askrene/askrene.h plugins/askrene/layer.h plugins/askrene/reserve.h plugins/askrene/mcf.h plugins/askrene/dijkstra.h plugins/askrene/flow.h plugins/askrene/refine.h plugins/askrene/explain_failure.h
PLUGIN_ASKRENE_OBJS := $(PLUGIN_ASKRENE_SRC:.c=.o)
$(PLUGIN_ASKRENE_OBJS): $(PLUGIN_ASKRENE_HEADER)
@ -7,4 +7,4 @@ $(PLUGIN_ASKRENE_OBJS): $(PLUGIN_ASKRENE_HEADER)
ALL_C_SOURCES += $(PLUGIN_ASKRENE_SRC)
ALL_C_HEADERS += $(PLUGIN_ASKRENE_HEADER)
plugins/cln-askrene: $(PLUGIN_ASKRENE_OBJS) $(PLUGIN_LIB_OBJS) $(PLUGIN_COMMON_OBJS) $(JSMN_OBJS) $(CCAN_OBJS) bitcoin/chainparams.o common/gossmap.o common/sciddir_or_pubkey.o common/gossmods_listpeerchannels.o common/fp16.o common/dijkstra.o common/bolt12.o common/bolt12_merkle.o wire/bolt12_wiregen.o wire/onion_wiregen.o
plugins/cln-askrene: $(PLUGIN_ASKRENE_OBJS) $(PLUGIN_LIB_OBJS) $(PLUGIN_COMMON_OBJS) $(JSMN_OBJS) $(CCAN_OBJS) bitcoin/chainparams.o common/gossmap.o common/sciddir_or_pubkey.o common/gossmods_listpeerchannels.o common/fp16.o common/dijkstra.o common/bolt12.o common/bolt12_merkle.o wire/bolt12_wiregen.o wire/onion_wiregen.o common/route.o

View file

@ -9,6 +9,7 @@
#include "config.h"
#include <ccan/array_size/array_size.h>
#include <ccan/tal/str/str.h>
#include <common/dijkstra.h>
#include <common/gossmap.h>
#include <common/gossmods_listpeerchannels.h>
#include <common/json_param.h>
@ -16,6 +17,7 @@
#include <common/route.h>
#include <errno.h>
#include <plugins/askrene/askrene.h>
#include <plugins/askrene/explain_failure.h>
#include <plugins/askrene/flow.h>
#include <plugins/askrene/layer.h>
#include <plugins/askrene/mcf.h>
@ -223,6 +225,7 @@ struct amount_msat get_additional_per_htlc_cost(const struct route_query *rq,
return AMOUNT_MSAT(0);
}
/* Returns an error message, or sets *routes */
static const char *get_routes(const tal_t *ctx,
struct command *cmd,
@ -327,10 +330,7 @@ static const char *get_routes(const tal_t *ctx,
flows = minflow(rq, rq, srcnode, dstnode, amount,
mu, delay_feefactor, base_fee_penalty, prob_cost_factor);
if (!flows) {
/* FIXME: disjktra here to see if there is any route, and
* diagnose problem (offline peers? Not enough capacity at
* our end? Not enough at theirs?) */
ret = tal_fmt(ctx, "Could not find route");
ret = explain_failure(ctx, rq, srcnode, dstnode, amount);
goto out;
}

View file

@ -0,0 +1,290 @@
#include "config.h"
#include <ccan/tal/str/str.h>
#include <common/dijkstra.h>
#include <common/gossmap.h>
#include <common/route.h>
#include <plugins/askrene/askrene.h>
#include <plugins/askrene/explain_failure.h>
#include <plugins/askrene/layer.h>
#include <plugins/askrene/reserve.h>
#define NO_USABLE_PATHS_STRING "We could not find a usable set of paths."
/* Dijkstra, reduced to ignore anything but connectivity */
static bool always_true(const struct gossmap *map,
const struct gossmap_chan *c,
int dir,
struct amount_msat amount,
void *unused)
{
return true;
}
static u64 route_score_one(struct amount_msat fee UNUSED,
struct amount_msat risk UNUSED,
struct amount_msat total UNUSED,
int dir UNUSED,
const struct gossmap_chan *c UNUSED)
{
return 1;
}
/* This mirrors get_constraints() */
static const char *why_max_constrained(const tal_t *ctx,
const struct route_query *rq,
struct short_channel_id_dir *scidd,
struct amount_msat amount)
{
char *ret = NULL;
const char *reservations;
const struct layer *constrains = NULL;
struct amount_msat max = amount;
/* Figure out the layer that constrains us (most) */
for (size_t i = 0; i < tal_count(rq->layers); i++) {
struct amount_msat min = AMOUNT_MSAT(0), new_max = max;
layer_apply_constraints(rq->layers[i], scidd, &min, &new_max);
if (!amount_msat_eq(new_max, max))
constrains = rq->layers[i];
max = new_max;
}
if (constrains) {
if (!ret)
ret = tal_strdup(ctx, "");
else
tal_append_fmt(&ret, ", ");
tal_append_fmt(&ret, "layer %s says max is %s",
layer_name(constrains),
fmt_amount_msat(tmpctx, max));
}
reservations = fmt_reservations(tmpctx, rq->reserved, scidd);
if (reservations) {
if (!ret)
ret = tal_strdup(ctx, "");
else
tal_append_fmt(&ret, " and ");
tal_append_fmt(&ret, "already reserved %s", reservations);
}
/* If that doesn't explain it, perhaps it violates htlc_max? */
if (!ret) {
struct gossmap_chan *c = gossmap_find_chan(rq->gossmap, &scidd->scid);
fp16_t htlc_max = c->half[scidd->dir].htlc_max;
if (amount_msat_greater_fp16(amount, htlc_max))
ret = tal_fmt(ctx, "exceeds htlc_maximum_msat ~%s",
fmt_amount_msat(tmpctx,
amount_msat(fp16_to_u64(htlc_max))));
}
/* This seems unlikely, but don't return NULL. */
if (!ret)
ret = tal_fmt(ctx, "is constrained");
return ret;
}
struct stat {
size_t num_channels;
struct amount_msat capacity;
};
struct node_stats {
struct stat total, gossip_known, enabled;
};
enum node_direction {
INTO_NODE,
OUT_OF_NODE,
};
static void add_stat(struct stat *stat,
struct amount_msat amount)
{
stat->num_channels++;
if (!amount_msat_accumulate(&stat->capacity, amount))
abort();
}
static void node_stats(const struct route_query *rq,
const struct gossmap_node *node,
enum node_direction node_direction,
struct node_stats *stats)
{
memset(stats, 0, sizeof(*stats));
for (size_t i = 0; i < node->num_chans; i++) {
int dir;
struct gossmap_chan *c;
struct amount_msat cap_msat;
c = gossmap_nth_chan(rq->gossmap, node, i, &dir);
cap_msat = gossmap_chan_get_capacity(rq->gossmap, c);
if (node_direction == INTO_NODE)
dir = !dir;
add_stat(&stats->total, cap_msat);
if (gossmap_chan_set(c, dir))
add_stat(&stats->gossip_known, cap_msat);
if (c->half[dir].enabled)
add_stat(&stats->enabled, cap_msat);
}
}
static const char *check_capacity(const tal_t *ctx,
const struct route_query *rq,
const struct gossmap_node *node,
enum node_direction node_direction,
struct amount_msat amount,
const char *name)
{
struct node_stats stats;
node_stats(rq, node, node_direction, &stats);
if (amount_msat_greater(amount, stats.total.capacity)) {
return tal_fmt(ctx,
NO_USABLE_PATHS_STRING
" Total %s capacity is only %s"
" (in %zu channels).",
name,
fmt_amount_msat(tmpctx, stats.total.capacity),
stats.total.num_channels);
}
if (amount_msat_greater(amount, stats.gossip_known.capacity)) {
return tal_fmt(ctx,
NO_USABLE_PATHS_STRING
" Missing gossip for %s: only known %zu/%zu channels, leaving capacity only %s of %s.",
name,
stats.gossip_known.num_channels,
stats.total.num_channels,
fmt_amount_msat(tmpctx, stats.gossip_known.capacity),
fmt_amount_msat(tmpctx, stats.total.capacity));
}
if (amount_msat_greater(amount, stats.enabled.capacity)) {
return tal_fmt(ctx,
NO_USABLE_PATHS_STRING
" The %s has disabled %zu of %zu channels, leaving capacity only %s of %s.",
name,
stats.total.num_channels - stats.enabled.num_channels,
stats.total.num_channels,
fmt_amount_msat(tmpctx, stats.enabled.capacity),
fmt_amount_msat(tmpctx, stats.total.capacity));
}
return NULL;
}
/* Return description of why scidd is disabled scidd */
static const char *describe_disabled(const tal_t *ctx,
const struct route_query *rq,
const struct short_channel_id_dir *scidd)
{
for (int i = tal_count(rq->layers) - 1; i >= 0; i--) {
if (layer_disables(rq->layers[i], scidd)) {
return tal_fmt(ctx, "marked disabled by layer %s.",
layer_name(rq->layers[i]));
}
}
return tal_fmt(ctx, "marked disabled by gossip message.");
}
static const char *describe_capacity(const tal_t *ctx,
const struct route_query *rq,
const struct short_channel_id_dir *scidd,
struct amount_msat amount)
{
for (int i = tal_count(rq->layers) - 1; i >= 0; i--) {
if (layer_created(rq->layers[i], scidd->scid)) {
return tal_fmt(ctx, " (created by layer %s) isn't big enough to carry %s.",
layer_name(rq->layers[i]),
fmt_amount_msat(tmpctx, amount));
}
}
return tal_fmt(ctx, "isn't big enough to carry %s.",
fmt_amount_msat(tmpctx, amount));
}
/* We failed to find a flow at all. Why? */
const char *explain_failure(const tal_t *ctx,
const struct route_query *rq,
const struct gossmap_node *srcnode,
const struct gossmap_node *dstnode,
struct amount_msat amount)
{
const struct route_hop *hops;
const struct dijkstra *dij;
char *path;
const char *cap_check;
/* Do we have enough funds? */
cap_check = check_capacity(ctx, rq, srcnode, OUT_OF_NODE,
amount, "source");
if (cap_check)
return cap_check;
/* Does destination have enough capacity? */
cap_check = check_capacity(ctx, rq, dstnode, INTO_NODE,
amount, "destination");
if (cap_check)
return cap_check;
/* OK, fall back to telling them why didn't shortest path
* work. This covers the "but I have a direct channel!"
* case. */
dij = dijkstra(tmpctx, rq->gossmap, dstnode, AMOUNT_MSAT(0), 0,
always_true, route_score_one, NULL);
hops = route_from_dijkstra(tmpctx, rq->gossmap, dij, srcnode,
AMOUNT_MSAT(0), 0);
if (!hops)
return tal_fmt(ctx, "There is no connection between source and destination at all");
/* Description of shortest path */
path = tal_strdup(tmpctx, "");
for (size_t i = 0; i < tal_count(hops); i++) {
tal_append_fmt(&path, "%s%s",
i > 0 ? "->" : "",
fmt_short_channel_id(tmpctx, hops[i].scid));
}
/* Now walk through this: is it disabled? Insuff capacity? */
for (size_t i = 0; i < tal_count(hops); i++) {
const char *explanation;
struct short_channel_id_dir scidd;
struct gossmap_chan *c;
struct amount_msat cap_msat;
scidd.scid = hops[i].scid;
scidd.dir = hops[i].direction;
c = gossmap_find_chan(rq->gossmap, &scidd.scid);
cap_msat = gossmap_chan_get_capacity(rq->gossmap, c);
if (!gossmap_chan_set(c, scidd.dir))
explanation = "has no gossip";
else if (!c->half[scidd.dir].enabled)
explanation = describe_disabled(tmpctx, rq, &scidd);
else if (amount_msat_greater(amount, cap_msat))
explanation = describe_capacity(tmpctx, rq, &scidd, amount);
else {
struct amount_msat min, max;
get_constraints(rq, c, scidd.dir, &min, &max);
if (amount_msat_less(max, amount)) {
explanation = why_max_constrained(tmpctx, rq,
&scidd, amount);
} else
continue;
}
return tal_fmt(ctx,
NO_USABLE_PATHS_STRING
" The shortest path is %s, but %s %s",
path,
fmt_short_channel_id_dir(tmpctx, &scidd),
explanation);
}
return tal_fmt(ctx,
"Actually, I'm not sure why we didn't find the"
" obvious route %s: perhaps this is a bug?",
path);
}

View file

@ -0,0 +1,16 @@
#ifndef LIGHTNING_PLUGINS_ASKRENE_EXPLAIN_FAILURE_H
#define LIGHTNING_PLUGINS_ASKRENE_EXPLAIN_FAILURE_H
#include "config.h"
#include <common/amount.h>
struct route_query;
struct gossmap_node;
/* When MCF returns nothing, try to explain why */
const char *explain_failure(const tal_t *ctx,
const struct route_query *rq,
const struct gossmap_node *srcnode,
const struct gossmap_node *dstnode,
struct amount_msat amount);
#endif /* LIGHTNING_PLUGINS_ASKRENE_EXPLAIN_FAILURE_H */

View file

@ -502,6 +502,21 @@ void json_add_layers(struct json_stream *js,
json_array_end(js);
}
bool layer_created(const struct layer *layer, struct short_channel_id scid)
{
return local_channel_hash_get(layer->local_channels, scid);
}
bool layer_disables(const struct layer *layer,
const struct short_channel_id_dir *scidd)
{
const struct local_update *lu;
lu = local_update_hash_get(layer->local_updates, scidd);
return (lu && lu->enabled && *lu->enabled == false);
}
void layer_memleak_mark(struct askrene *askrene, struct htable *memtable)
{
struct layer *l;

View file

@ -102,6 +102,12 @@ void json_add_constraint(struct json_stream *js,
const struct constraint *c,
const struct layer *layer);
/* For explain_failure: did this layer create this scid? */
bool layer_created(const struct layer *layer, struct short_channel_id scid);
/* For explain_failure: did this layer disable this channel? */
bool layer_disables(const struct layer *layer, const struct short_channel_id_dir *scidd);
/* Scan for memleaks */
void layer_memleak_mark(struct askrene *askrene, struct htable *memtable);
#endif /* LIGHTNING_PLUGINS_ASKRENE_LAYER_H */

View file

@ -151,6 +151,32 @@ void json_add_reservations(struct json_stream *js,
json_array_end(js);
}
const char *fmt_reservations(const tal_t *ctx,
const struct reserve_htable *reserved,
const struct short_channel_id_dir *scidd)
{
struct reserve *r;
struct reserve_htable_iter rit;
char *ret = NULL;
for (r = reserve_htable_getfirst(reserved, scidd, &rit);
r;
r = reserve_htable_getnext(reserved, scidd, &rit)) {
u64 seconds;
if (!ret)
ret = tal_strdup(ctx, "");
else
tal_append_fmt(&ret, ", ");
tal_append_fmt(&ret, "%s by command %s",
fmt_amount_msat(tmpctx, r->rhop.amount), r->cmd_id);
seconds = timemono_between(time_mono(), r->timestamp).ts.tv_sec;
/* Add a note if it's old */
if (seconds > 0)
tal_append_fmt(&ret, " (%"PRIu64" seconds ago)", seconds);
}
return ret;
}
void reserve_memleak_mark(struct askrene *askrene, struct htable *memtable)
{
memleak_scan_htable(memtable, &askrene->reserved->raw);

View file

@ -40,6 +40,11 @@ bool reserve_accumulate(const struct reserve_htable *reserved,
const struct short_channel_id_dir *scidd,
struct amount_msat *amount);
/* To explain why we couldn't route */
const char *fmt_reservations(const tal_t *ctx,
const struct reserve_htable *reserved,
const struct short_channel_id_dir *scidd);
/* Print out a json object for all reservations */
void json_add_reservations(struct json_stream *js,
const struct reserve_htable *reserved,

View file

@ -20,8 +20,10 @@ def test_reserve(node_factory):
l1, l2, l3 = node_factory.line_graph(3, wait_for_announce=True)
assert l1.rpc.askrene_listreservations() == {'reservations': []}
scid12dir = f"{first_scid(l1, l2)}/{direction(l1.info['id'], l2.info['id'])}"
scid23dir = f"{first_scid(l2, l3)}/{direction(l2.info['id'], l3.info['id'])}"
scid12 = first_scid(l1, l2)
scid23 = first_scid(l2, l3)
scid12dir = f"{scid12}/{direction(l1.info['id'], l2.info['id'])}"
scid23dir = f"{scid23}/{direction(l2.info['id'], l3.info['id'])}"
initial_prob = l1.rpc.getroutes(source=l1.info['id'],
destination=l3.info['id'],
@ -60,8 +62,12 @@ def test_reserve(node_factory):
{'short_channel_id_dir': scid23dir,
'amount_msat': 1000_000_000_000}])
# FIXME: better error!
with pytest.raises(RpcError, match="Could not find route"):
# Keep it consistent: the below will mention a time if >= 1 seconds old,
# which might happen without the sleep on slow machines.
time.sleep(2)
# Reservations can be in either order.
with pytest.raises(RpcError, match=rf'We could not find a usable set of paths. The shortest path is {scid12}->{scid23}, but {scid12dir} already reserved 10000000*msat by command ".*" \([0-9]* seconds ago\), 10000000*msat by command ".*" \([0-9]* seconds ago\)'):
l1.rpc.getroutes(source=l1.info['id'],
destination=l3.info['id'],
amount_msat=1000000,
@ -305,18 +311,46 @@ def test_getroutes(node_factory):
# Set up l1 with this as the gossip_store
l1 = node_factory.get_node(gossip_store_file=gsfile.name)
# Too much should give a decent explanation.
with pytest.raises(RpcError, match=r"We could not find a usable set of paths\. The shortest path is 0x1x0, but 0x1x0/1 isn't big enough to carry 1000000001msat\."):
l1.rpc.getroutes(source=nodemap[0],
destination=nodemap[1],
amount_msat=1000000001,
layers=[],
maxfee_msat=100000000,
final_cltv=99)
# This should tell us source doesn't have enough.
with pytest.raises(RpcError, match=r"We could not find a usable set of paths\. Total source capacity is only 1019000000msat \(in 3 channels\)\."):
l1.rpc.getroutes(source=nodemap[0],
destination=nodemap[1],
amount_msat=2000000001,
layers=[],
maxfee_msat=20000000,
final_cltv=99)
# This should tell us dest doesn't have enough.
with pytest.raises(RpcError, match=r"We could not find a usable set of paths\. Total destination capacity is only 1000000000msat \(in 1 channels\)\."):
l1.rpc.getroutes(source=nodemap[0],
destination=nodemap[4],
amount_msat=1000000001,
layers=[],
maxfee_msat=30000000,
final_cltv=99)
# Disabling channels makes getroutes fail
l1.rpc.askrene_create_layer('chans_disabled')
l1.rpc.askrene_update_channel(layer="chans_disabled",
short_channel_id_dir='0x1x0/1',
enabled=False)
with pytest.raises(RpcError, match="Could not find route"):
with pytest.raises(RpcError, match=r"We could not find a usable set of paths\. The shortest path is 0x1x0, but 0x1x0/1 marked disabled by layer chans_disabled\."):
l1.rpc.getroutes(source=nodemap[0],
destination=nodemap[1],
amount_msat=1000,
layers=["chans_disabled"],
maxfee_msat=1000,
final_cltv=99)
# Start easy
assert l1.rpc.getroutes(source=nodemap[0],
destination=nodemap[1],