pay: handle returned errors more gracefully.

The code had incorrect assertions, partially because it didn't clearly distinguish errors from the final node (which, barring blockheight issues, mean a complete failre) and intermediate nodes. In particular, we can't trust the *values*, so we need to distinguish these by the *sender*. If a route is of length 2 (A, B): - erring_index == 0 means us complaining about channel A. - erring_index == 1 means A.node complaining about channel B. - erring_index == 2 means the final destination node B.node. This is particularly of note because Travis does NOT run test_pay_routeboost! Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
2025-01-18 13:25:43 +01:00 · 2020-07-20 15:17:55 +09:30 · 2020-07-20 15:17:55 +09:30 · 178415aca7
commit 178415aca7
parent 6951a9ea42
1 changed files with 262 additions and 91 deletions
--- a/plugins/libplugin-pay.c
+++ b/plugins/libplugin-pay.c
@ -592,7 +592,8 @@ fail:
 }

 static void channel_hints_update(struct payment *root,
-				 struct short_channel_id *scid, int direction,
+				 const struct short_channel_id *scid,
+				 int direction,
 				 bool enabled,
 				 struct amount_msat estimated_capacity)
 {
@ -660,12 +661,255 @@ static void payment_result_infer(struct route_hop *route,
 		r->erring_direction = &route[i].direction;
 }

+/* If a node takes too much fee or cltv, the next one reports it.  We don't
+ * know who to believe, but log it */
+static void report_tampering(struct payment *p,
+			     size_t report_pos,
+			     const char *style)
+{
+	const struct node_id *id = &p->route[report_pos].nodeid;
+
+	if (report_pos == 0) {
+		plugin_log(p->plugin, LOG_UNUSUAL,
+			   "Node #%zu (%s) claimed we sent them invalid %s",
+			   report_pos + 1,
+			   type_to_string(tmpctx, struct node_id, id),
+			   style);
+	} else {
+		plugin_log(p->plugin, LOG_UNUSUAL,
+			   "Node #%zu (%s) claimed #%zu (%s) sent them invalid %s",
+			   report_pos + 1,
+			   type_to_string(tmpctx, struct node_id, id),
+			   report_pos,
+			   type_to_string(tmpctx, struct node_id,
+					  &p->route[report_pos-1].nodeid),
+			   style);
+	}
+}
+
+static struct command_result *
+handle_final_failure(struct command *cmd,
+		     struct payment *p,
+		     const struct node_id *final_id,
+		     enum onion_type failcode)
+{
+	/* We use an exhaustive switch statement here so you get a compile
+	 * warning when new ones are added, and can think about where they go */
+	switch (failcode) {
+	case WIRE_FINAL_INCORRECT_CLTV_EXPIRY:
+		report_tampering(p, tal_count(p->route)-1, "cltv");
+		goto error;
+	case WIRE_FINAL_INCORRECT_HTLC_AMOUNT:
+		report_tampering(p, tal_count(p->route)-1, "amount");
+		goto error;
+
+	/* BOLT #4:
+	 *
+	 * A _forwarding node_ MAY, but a _final node_ MUST NOT:
+	 *...
+	 *     - return an `invalid_onion_version` error.
+	 *...
+	 *     - return an `invalid_onion_hmac` error.
+	 *...
+	 *     - return an `invalid_onion_key` error.
+	 *...
+	 *     - return a `temporary_channel_failure` error.
+	 *...
+	 *     - return a `permanent_channel_failure` error.
+	 *...
+	 *     - return a `required_channel_feature_missing` error.
+	 *...
+	 *     - return an `unknown_next_peer` error.
+	 *...
+	 *     - return an `amount_below_minimum` error.
+	 *...
+	 *     - return a `fee_insufficient` error.
+	 *...
+	 *     - return an `incorrect_cltv_expiry` error.
+	 *...
+	 *     - return an `expiry_too_soon` error.
+	 *...
+	 *     - return an `expiry_too_far` error.
+	 *...
+	 *     - return a `channel_disabled` error.
+	 */
+	case WIRE_INVALID_ONION_VERSION:
+	case WIRE_INVALID_ONION_HMAC:
+	case WIRE_INVALID_ONION_KEY:
+	case WIRE_TEMPORARY_CHANNEL_FAILURE:
+	case WIRE_PERMANENT_CHANNEL_FAILURE:
+	case WIRE_REQUIRED_CHANNEL_FEATURE_MISSING:
+	case WIRE_UNKNOWN_NEXT_PEER:
+	case WIRE_AMOUNT_BELOW_MINIMUM:
+	case WIRE_FEE_INSUFFICIENT:
+	case WIRE_INCORRECT_CLTV_EXPIRY:
+	case WIRE_EXPIRY_TOO_FAR:
+	case WIRE_EXPIRY_TOO_SOON:
+	case WIRE_CHANNEL_DISABLED:
+		goto strange_error;
+
+	case WIRE_INVALID_ONION_PAYLOAD:
+	case WIRE_INVALID_REALM:
+	case WIRE_PERMANENT_NODE_FAILURE:
+	case WIRE_TEMPORARY_NODE_FAILURE:
+	case WIRE_REQUIRED_NODE_FEATURE_MISSING:
+#if EXPERIMENTAL_FEATURES
+	case WIRE_INVALID_ONION_BLINDING:
+#endif
+ 	case WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS:
+	case WIRE_MPP_TIMEOUT:
+		goto error;
+	}
+
+strange_error:
+	plugin_log(p->plugin, LOG_UNUSUAL,
+		   "Final node %s reported strange error code %u",
+		   type_to_string(tmpctx, struct node_id, final_id),
+		   failcode);
+
+error:
+	p->result->code = PAY_DESTINATION_PERM_FAIL;
+	payment_root(p)->abort = true;
+
+	payment_fail(p, "%s", p->result->message);
+	return command_still_pending(cmd);
+}
+
+
+static struct command_result *
+handle_intermediate_failure(struct command *cmd,
+			    struct payment *p,
+			    const struct node_id *errnode,
+			    const struct route_hop *errchan,
+			    enum onion_type failcode)
+{
+	struct payment *root = payment_root(p);
+
+	/* We use an exhaustive switch statement here so you get a compile
+	 * warning when new ones are added, and can think about where they go */
+	switch (failcode) {
+	/* BOLT #4:
+	 *
+	 * An _intermediate hop_ MUST NOT, but the _final node_:
+	 *...
+	 *     - MUST return an `incorrect_or_unknown_payment_details` error.
+	 *...
+	 *     - MUST return `final_incorrect_cltv_expiry` error.
+	 *...
+	 *     - MUST return a `final_incorrect_htlc_amount` error.
+	 */
+ 	case WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS:
+ 	case WIRE_FINAL_INCORRECT_CLTV_EXPIRY:
+ 	case WIRE_FINAL_INCORRECT_HTLC_AMOUNT:
+	/* FIXME: Document in BOLT that intermediates must not return this! */
+	case WIRE_MPP_TIMEOUT:
+		goto strange_error;
+
+	case WIRE_PERMANENT_CHANNEL_FAILURE:
+	case WIRE_CHANNEL_DISABLED:
+	case WIRE_UNKNOWN_NEXT_PEER:
+	case WIRE_REQUIRED_CHANNEL_FEATURE_MISSING:
+		/* All of these result in the channel being marked as disabled. */
+		channel_hints_update(root,
+				     &errchan->channel_id, errchan->direction,
+				     false, AMOUNT_MSAT(0));
+		break;
+
+	case WIRE_TEMPORARY_CHANNEL_FAILURE: {
+		/* These are an indication that the capacity was insufficient,
+		 * remember the amount we tried as an estimate. */
+		struct amount_msat est = errchan->amount;
+		est.millisatoshis *= 0.75; /* Raw: Multiplication */
+		channel_hints_update(root,
+				     &errchan->channel_id, errchan->direction,
+				     true, est);
+		goto error;
+	}
+
+	case WIRE_INCORRECT_CLTV_EXPIRY:
+		report_tampering(p, errchan - p->route, "cltv");
+		goto error;
+
+	case WIRE_INVALID_ONION_VERSION:
+	case WIRE_INVALID_ONION_HMAC:
+	case WIRE_INVALID_ONION_KEY:
+	case WIRE_PERMANENT_NODE_FAILURE:
+	case WIRE_TEMPORARY_NODE_FAILURE:
+	case WIRE_REQUIRED_NODE_FEATURE_MISSING:
+	case WIRE_INVALID_ONION_PAYLOAD:
+	case WIRE_INVALID_REALM:
+#if EXPERIMENTAL_FEATURES
+	case WIRE_INVALID_ONION_BLINDING:
+#endif
+		tal_arr_expand(&root->excluded_nodes, *errnode);
+		goto error;
+
+	case WIRE_AMOUNT_BELOW_MINIMUM:
+	case WIRE_FEE_INSUFFICIENT:
+	case WIRE_EXPIRY_TOO_FAR:
+	case WIRE_EXPIRY_TOO_SOON:
+		goto error;
+	}
+
+strange_error:
+	plugin_log(p->plugin, LOG_UNUSUAL,
+		   "Intermediate node %s reported strange error code %u",
+		   type_to_string(tmpctx, struct node_id, errnode),
+		   failcode);
+
+error:
+	payment_fail(p, "%s", p->result->message);
+	return command_still_pending(cmd);
+}
+
+/* From the docs:
+ *
+ * - *erring_index*: The index of the node along the route that
+ *   reported the error. 0 for the local node, 1 for the first hop,
+ *   and so on.
+ *
+ * The only difficulty is mapping the erring_index to the correct hop.
+ * We split into the erring node, and the error channel, since they're
+ * used in different contexts. NULL error_channel means it's the final
+ * node, whose errors are treated differently.
+ */
+static bool assign_blame(const struct payment *p,
+			 const struct node_id **errnode,
+			 const struct route_hop **errchan)
+{
+	int index;
+
+	if (p->result->erring_index == NULL)
+		return false;
+
+	index = *p->result->erring_index;
+
+	/* BADONION errors are reported on behalf of the next node. */
+	if (p->result->failcode & BADONION)
+		index++;
+
+	/* Final node *shouldn't* report BADONION, but don't assume. */
+	if (index >= tal_count(p->route)) {
+		*errchan = NULL;
+		*errnode = &p->route[tal_count(p->route) - 1].nodeid;
+		return true;
+	}
+
+	*errchan = &p->route[index];
+	if (index == 0)
+		*errnode = p->local_id;
+	else
+		*errnode = &p->route[index - 1].nodeid;
+	return true;
+}
+
 static struct command_result *
 payment_waitsendpay_finished(struct command *cmd, const char *buffer,
 			     const jsmntok_t *toks, struct payment *p)
 {
-	struct payment *root;
-	struct route_hop *hop;
+	const struct node_id *errnode;
+	const struct route_hop *errchan;
+
 	assert(p->route != NULL);

 	p->end_time = time_now();
@ -689,98 +933,25 @@ payment_waitsendpay_finished(struct command *cmd, const char *buffer,
 		return command_still_pending(cmd);
 	}

-	root = payment_root(p);
 	payment_chanhints_apply_route(p, true);

-	/* Either we have no erring_index, or it must be a correct index into
-	 * p->route. From the docs:
-	 *
-	 * - *erring_index*: The index of the node along the route that
-	 *   reported the error. 0 for the local node, 1 for the first hop,
-	 *   and so on.
-         *
-         * The only difficulty is mapping the erring_index to the correct hop,
-         * since the hop consists of the processing node, but the payload for
-         * the next hop. In addition there is a class of onion-related errors
-         * that are reported by the previous hop to the one erring, since the
-         * erring node couldn't read the onion in the first place.
-	 */
-	assert(p->result->erring_index == NULL ||
-	       *p->result->erring_index - 1 < tal_count(p->route));
-
-	switch (p->result->failcode) {
-	case WIRE_PERMANENT_CHANNEL_FAILURE:
-	case WIRE_CHANNEL_DISABLED:
-	case WIRE_UNKNOWN_NEXT_PEER:
-	case WIRE_REQUIRED_CHANNEL_FEATURE_MISSING:
-		/* All of these result in the channel being marked as disabled. */
-		assert(*p->result->erring_index < tal_count(p->route));
-		hop = &p->route[*p->result->erring_index];
-		channel_hints_update(root, &hop->channel_id, hop->direction,
-				     false, AMOUNT_MSAT(0));
-		break;
-	case WIRE_TEMPORARY_CHANNEL_FAILURE:
-		/* These are an indication that the capacity was insufficient,
-		 * remember the amount we tried as an estimate. */
-		assert(*p->result->erring_index < tal_count(p->route));
-		hop = &p->route[*p->result->erring_index];
-		struct amount_msat est = {
-			.millisatoshis = hop->amount.millisatoshis * 0.75}; /* Raw: Multiplication */
-		channel_hints_update(root, &hop->channel_id, hop->direction,
-				     true, est);
-		break;
-
-	case WIRE_INVALID_ONION_PAYLOAD:
-	case WIRE_INVALID_REALM:
-	case WIRE_PERMANENT_NODE_FAILURE:
-	case WIRE_TEMPORARY_NODE_FAILURE:
-	case WIRE_REQUIRED_NODE_FEATURE_MISSING:
-	case WIRE_INVALID_ONION_VERSION:
-	case WIRE_INVALID_ONION_HMAC:
-	case WIRE_INVALID_ONION_KEY:
-#if EXPERIMENTAL_FEATURES
-	case WIRE_INVALID_ONION_BLINDING:
-#endif
-		/* These are reported by the last hop, i.e., the destination of hop i-1. */
-		assert(*p->result->erring_index - 1 < tal_count(p->route));
-		hop = &p->route[*p->result->erring_index - 1];
-		tal_arr_expand(&root->excluded_nodes, hop->nodeid);
-		break;
-
- 	case WIRE_INCORRECT_OR_UNKNOWN_PAYMENT_DETAILS:
-		p->result->code = PAY_DESTINATION_PERM_FAIL;
-		root->abort = true;
-	case WIRE_MPP_TIMEOUT:
-		/* These are permanent failures that should abort all of our
-		 * attempts right away. We'll still track pending partial
-		 * payments correctly, just not start new ones. */
-		root->abort = true;
-		break;
-
-	case WIRE_AMOUNT_BELOW_MINIMUM:
-	case WIRE_EXPIRY_TOO_FAR:
-	case WIRE_EXPIRY_TOO_SOON:
-	case WIRE_FEE_INSUFFICIENT:
-	case WIRE_INCORRECT_CLTV_EXPIRY:
-	case WIRE_FINAL_INCORRECT_CLTV_EXPIRY:
-		/* These are issues that are due to gossipd being out of date,
-		 * we ignore them here, and wait for gossipd to adjust
-		 * instead. */
-		break;
-	case WIRE_FINAL_INCORRECT_HTLC_AMOUNT:
-		/* These are symptoms of intermediate hops tampering with the
-		 * payment. */
-		hop = &p->route[*p->result->erring_index];
-		plugin_log(
-		    p->plugin, LOG_UNUSUAL,
-		    "Node %s reported an incorrect HTLC amount, this could be "
-		    "a prior hop messing with the amounts.",
-		    type_to_string(tmpctx, struct node_id, &hop->nodeid));
-		break;
+	if (!assign_blame(p, &errnode, &errchan)) {
+		plugin_log(p->plugin, LOG_UNUSUAL,
+			   "No erring_index set in `waitsendpay` result: %.*s",
+			   json_tok_full_len(toks),
+			   json_tok_full(buffer, toks));
+		/* FIXME: Pick a random channel to fail? */
+		payment_set_step(p, PAYMENT_STEP_FAILED);
+		payment_continue(p);
+		return command_still_pending(cmd);
 	}

-	payment_fail(p, "%s", p->result->message);
-	return command_still_pending(cmd);
+	if (!errchan)
+		return handle_final_failure(cmd, p, errnode,
+					    p->result->failcode);
+
+	return handle_intermediate_failure(cmd, p, errnode, errchan,
+					   p->result->failcode);
 }

 static struct command_result *payment_sendonion_success(struct command *cmd,