peer_fail: differentiate transient and permanent failures.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
Rusty Russell 2017-06-20 15:47:03 +09:30
parent 66d55334b0
commit ffa5e1c52c
4 changed files with 156 additions and 71 deletions

View File

@ -38,8 +38,7 @@ static void peer_bad_message(struct subd *gossip, const u8 *msg)
log_debug(gossip->log, "Peer %s gave bad msg %s",
type_to_string(msg, struct pubkey, &peer->id),
tal_hex(msg, msg));
peer_fail(peer, "Bad message %s during gossip phase",
gossip_wire_type_name(fromwire_peektype(msg)));
peer_fail_permanent(peer, msg);
}
static void peer_failed(struct subd *gossip, const u8 *msg)
@ -57,10 +56,7 @@ static void peer_failed(struct subd *gossip, const u8 *msg)
if (!peer)
fatal("Gossip gave bad peerid %"PRIu64, unique_id);
log_unusual(gossip->log, "Peer %s failed: %.*s",
type_to_string(msg, struct pubkey, &peer->id),
(int)tal_len(err), (const char *)err);
peer_fail(peer, "Error during gossip phase");
peer_fail_permanent(peer, msg);
}
static void peer_nongossip(struct subd *gossip, const u8 *msg,

View File

@ -84,12 +84,58 @@ static void peer_reconnect(struct peer *peer)
try_reconnect, peer);
}
void peer_fail(struct peer *peer, const char *fmt, ...)
static void drop_to_chain(struct peer *peer)
{
/* FIXME: Implement. */
}
void peer_fail_permanent(struct peer *peer, const u8 *msg)
{
/* BOLT #1:
*
* The channel is referred to by `channel_id` unless `channel_id` is
* zero (ie. all bytes zero), in which case it refers to all
* channels. */
static const struct channel_id all_channels;
log_unusual(peer->log, "Peer permanent failure in %s: %.*s",
peer_state_name(peer->state),
(int)tal_len(msg), (char *)msg);
peer->error = towire_error(peer, &all_channels, msg);
/* In case we reconnected in the meantime. */
if (peer->fd != -1) {
/* FIXME: We should retransmit error if this happens. */
close(peer->fd);
}
if (peer_persists(peer))
drop_to_chain(peer);
else
tal_free(peer);
return;
}
void peer_internal_error(struct peer *peer, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
log_info(peer->log, "Peer failure in %s: ",
log_broken(peer->log, "Peer internal error %s: ",
peer_state_name(peer->state));
logv_add(peer->log, fmt, ap);
va_end(ap);
peer_fail_permanent(peer,
take((u8 *)tal_strdup(peer, "Internal error")));
}
void peer_fail_transient(struct peer *peer, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
log_info(peer->log, "Peer transient failure in %s: ",
peer_state_name(peer->state));
logv_add(peer->log, fmt, ap);
va_end(ap);
@ -133,6 +179,15 @@ static bool peer_start_channeld_hsmfd(struct subd *hsm, const u8 *resp,
const int *fds,
struct peer *peer);
/* Send (encrypted) error message, then close. */
static struct io_plan *send_error(struct io_conn *conn, struct peer *peer)
{
struct peer_crypto_state *pcs = tal(conn, struct peer_crypto_state);
init_peer_crypto_state(peer, pcs);
pcs->cs = *peer->cs;
return peer_write_message(conn, pcs, peer->error, (void *)io_close_cb);
}
/* Returns true if we consider this a reconnection. */
static bool peer_reconnected(struct lightningd *ld,
const struct pubkey *id,
@ -150,6 +205,16 @@ static bool peer_reconnected(struct lightningd *ld,
tal_free(peer->cs);
peer->cs = tal_dup(peer, struct crypto_state, cs);
/* BOLT #2:
*
* On reconnection, if a channel is in an error state, the node SHOULD
* retransmit the error packet and ignore any other packets for that
* channel, and the following requirements do not apply. */
if (peer->error) {
io_new_conn(peer, fd, send_error, peer);
return true;
}
/* We need this for init */
peer->reconnected = true;
@ -282,6 +347,7 @@ void add_peer(struct lightningd *ld, u64 unique_id,
/* Fresh peer. */
peer = tal(ld, struct peer);
peer->ld = ld;
peer->error = NULL;
peer->unique_id = unique_id;
peer->owner = NULL;
peer->id = *id;
@ -370,8 +436,11 @@ static void peer_owner_finished(struct subd *subd, int status)
}
subd->peer->owner = NULL;
peer_fail(subd->peer, "Owning subdaemon %s died (%i)",
subd->name, status);
/* Don't do a transient error if it's already perm failed. */
if (!subd->peer->error)
peer_fail_transient(subd->peer, "Owning subdaemon %s died (%i)",
subd->name, status);
}
static int make_listen_fd(struct lightningd *ld,
@ -946,7 +1015,7 @@ static bool peer_start_channeld_hsmfd(struct subd *hsm, const u8 *resp,
if (!peer->owner) {
log_unusual(peer->log, "Could not subdaemon channel: %s",
strerror(errno));
peer_fail(peer, "Failed to subdaemon channel");
peer_fail_transient(peer, "Failed to subdaemon channel");
return true;
}
@ -1098,21 +1167,24 @@ static bool opening_funder_finished(struct subd *opening, const u8 *resp,
bitcoin_txid(fc->funding_tx, fc->peer->funding_txid);
if (!structeq(fc->peer->funding_txid, &funding_txid)) {
peer_fail(fc->peer, "Funding txid mismatch:"
" satoshi %"PRIu64" change %"PRIu64" changeidx %u"
" localkey %s remotekey %s",
fc->peer->funding_satoshi,
fc->change, fc->change_keyindex,
type_to_string(fc, struct pubkey, &local_fundingkey),
type_to_string(fc, struct pubkey,
&channel_info->remote_fundingkey));
peer_internal_error(fc->peer,
"Funding txid mismatch:"
" satoshi %"PRIu64" change %"PRIu64
" changeidx %u"
" localkey %s remotekey %s",
fc->peer->funding_satoshi,
fc->change, fc->change_keyindex,
type_to_string(fc, struct pubkey,
&local_fundingkey),
type_to_string(fc, struct pubkey,
&channel_info->remote_fundingkey));
return false;
}
/* We should have sent and received the first commitsig */
if (!peer_save_commitsig_received(fc->peer, 0)
|| !peer_save_commitsig_sent(fc->peer, 0)) {
peer_fail(fc->peer, "Saving commitsig failed");
peer_internal_error(fc->peer, "Saving commitsig failed");
return false;
}
@ -1246,10 +1318,12 @@ void peer_fundee_open(struct peer *peer, const u8 *from_peer)
/* Note: gossipd handles unknown packets, so we don't have to worry
* about ignoring odd ones here. */
if (fromwire_peektype(from_peer) != WIRE_OPEN_CHANNEL) {
char *msg = tal_fmt(peer, "Bad message %i (%s) before opening",
fromwire_peektype(from_peer),
wire_type_name(fromwire_peektype(from_peer)));
log_unusual(peer->log, "Strange message to exit gossip: %u",
fromwire_peektype(from_peer));
peer_fail(peer, "Bad message during gossiping: %s",
tal_hex(peer, from_peer));
peer_fail_permanent(peer, (u8 *)take(msg));
return;
}
@ -1260,8 +1334,8 @@ void peer_fundee_open(struct peer *peer, const u8 *from_peer)
take(&peer->fd), &peer->gossip_client_fd,
NULL);
if (!peer->owner) {
peer_fail(peer, "Failed to subdaemon opening: %s",
strerror(errno));
peer_fail_transient(peer, "Failed to subdaemon opening: %s",
strerror(errno));
return;
}
@ -1294,7 +1368,8 @@ void peer_fundee_open(struct peer *peer, const u8 *from_peer)
/* Careful here! Their message could push us overlength! */
if (tal_len(msg) >= 65536) {
peer_fail(peer, "Unacceptably long open_channel");
char *err = tal_strdup(peer, "Unacceptably long open_channel");
peer_fail_permanent(peer, (u8 *)take(err));
return;
}
subd_req(peer, peer->owner, take(msg), -1, 1,
@ -1343,8 +1418,8 @@ static bool gossip_peer_released(struct subd *gossip,
take(&fc->peer->fd),
&fc->peer->gossip_client_fd, NULL);
if (!opening) {
peer_fail(fc->peer, "Failed to subdaemon opening: %s",
strerror(errno));
peer_fail_transient(fc->peer, "Failed to subdaemon opening: %s",
strerror(errno));
return true;
}
fc->peer->owner = opening;

View File

@ -24,6 +24,9 @@ struct peer {
/* ID of peer */
struct pubkey id;
/* Error message (iff in error state) */
u8 *error;
/* Their shachain. */
struct shachain their_shachain;
@ -140,8 +143,13 @@ void peer_fundee_open(struct peer *peer, const u8 *msg);
void add_peer(struct lightningd *ld, u64 unique_id,
int fd, const struct pubkey *id,
const struct crypto_state *cs);
/* Peer has failed. */
PRINTF_FMT(2,3) void peer_fail(struct peer *peer, const char *fmt, ...);
/* Peer has failed, but try reconnected. */
PRINTF_FMT(2,3) void peer_fail_transient(struct peer *peer, const char *fmt,...);
/* Peer has failed, give up on it. */
void peer_fail_permanent(struct peer *peer, const u8 *msg);
/* Permanent error, but due to internal problems, not peer. */
void peer_internal_error(struct peer *peer, const char *fmt, ...);
const char *peer_state_name(enum peer_state state);
void peer_set_condition(struct peer *peer, enum peer_state oldstate,

View File

@ -1,5 +1,6 @@
#include <ccan/build_assert/build_assert.h>
#include <ccan/mem/mem.h>
#include <ccan/tal/str/str.h>
#include <daemon/chaintopology.h>
#include <daemon/invoice.h>
#include <daemon/log.h>
@ -334,7 +335,7 @@ static bool rcvd_htlc_reply(struct subd *subd, const u8 *msg, const int *fds,
&hout->key.id,
&failure_code,
&failurestr)) {
log_broken(subd->log, "Bad channel_offer_htlc_reply");
peer_internal_error(subd->peer, "Bad channel_offer_htlc_reply");
tal_free(hout);
return false;
}
@ -345,9 +346,10 @@ static bool rcvd_htlc_reply(struct subd *subd, const u8 *msg, const int *fds,
}
if (find_htlc_out(&subd->ld->htlcs_out, hout->key.peer, hout->key.id)) {
log_broken(subd->log, "Bad offer_htlc_reply HTLC id %"PRIu64
" is a duplicate",
hout->key.id);
peer_internal_error(subd->peer,
"Bad offer_htlc_reply HTLC id %"PRIu64
" is a duplicate",
hout->key.id);
tal_free(hout);
return false;
}
@ -530,10 +532,11 @@ static bool state_update_ok(struct peer *peer,
expected = SENT_REMOVE_COMMIT;
if (newstate != expected) {
log_broken(peer->log, "HTLC %s %"PRIu64" invalid update %s->%s",
dir, htlc_id,
htlc_state_name(oldstate),
htlc_state_name(newstate));
peer_internal_error(peer,
"HTLC %s %"PRIu64" invalid update %s->%s",
dir, htlc_id,
htlc_state_name(oldstate),
htlc_state_name(newstate));
return false;
}
@ -582,8 +585,8 @@ static bool peer_accepted_htlc(struct peer *peer,
hin = find_htlc_in(&peer->ld->htlcs_in, peer, id);
if (!hin) {
log_broken(peer->log,
"peer_got_revoke unknown htlc %"PRIu64, id);
peer_internal_error(peer,
"peer_got_revoke unknown htlc %"PRIu64, id);
return false;
}
@ -595,7 +598,7 @@ static bool peer_accepted_htlc(struct peer *peer,
sizeof(hin->onion_routing_packet));
if (!op) {
if (!memeqzero(&hin->shared_secret, sizeof(hin->shared_secret))){
log_broken(peer->log,
peer_internal_error(peer,
"bad onion in got_revoke: %s",
tal_hexstr(peer, hin->onion_routing_packet,
sizeof(hin->onion_routing_packet)));
@ -665,9 +668,9 @@ static bool peer_fulfilled_our_htlc(struct peer *peer,
hout = find_htlc_out(&peer->ld->htlcs_out, peer, fulfilled->id);
if (!hout) {
log_broken(peer->log,
"fulfilled_our_htlc unknown htlc %"PRIu64,
fulfilled->id);
peer_internal_error(peer,
"fulfilled_our_htlc unknown htlc %"PRIu64,
fulfilled->id);
return false;
}
@ -694,9 +697,9 @@ static bool peer_failed_our_htlc(struct peer *peer,
hout = find_htlc_out(&peer->ld->htlcs_out, peer, failed->id);
if (!hout) {
log_broken(peer->log,
"failed_our_htlc unknown htlc %"PRIu64,
failed->id);
peer_internal_error(peer,
"failed_our_htlc unknown htlc %"PRIu64,
failed->id);
return false;
}
@ -763,7 +766,7 @@ static bool update_in_htlc(struct peer *peer, u64 id, enum htlc_state newstate)
hin = find_htlc_in(&peer->ld->htlcs_in, peer, id);
if (!hin) {
log_broken(peer->log, "Can't find in HTLC %"PRIu64, id);
peer_internal_error(peer, "Can't find in HTLC %"PRIu64, id);
return false;
}
@ -782,7 +785,7 @@ static bool update_out_htlc(struct peer *peer, u64 id, enum htlc_state newstate)
hout = find_htlc_out(&peer->ld->htlcs_out, peer, id);
if (!hout) {
log_broken(peer->log, "Can't find out HTLC %"PRIu64, id);
peer_internal_error(peer, "Can't find out HTLC %"PRIu64, id);
return false;
}
@ -825,14 +828,14 @@ int peer_sending_commitsig(struct peer *peer, const u8 *msg)
&commitnum,
&changed_htlcs,
&commit_sig, &htlc_sigs)) {
log_broken(peer->log, "bad channel_sending_commitsig %s",
tal_hex(peer, msg));
peer_internal_error(peer, "bad channel_sending_commitsig %s",
tal_hex(peer, msg));
return -1;
}
for (i = 0; i < tal_count(changed_htlcs); i++) {
if (!changed_htlc(peer, changed_htlcs + i)) {
log_broken(peer->log,
peer_internal_error(peer,
"channel_sending_commitsig: update failed");
return -1;
}
@ -848,7 +851,7 @@ int peer_sending_commitsig(struct peer *peer, const u8 *msg)
if (num_local_added != 0) {
if (maxid != peer->next_htlc_id + num_local_added - 1) {
log_broken(peer->log,
peer_internal_error(peer,
"channel_sending_commitsig:"
" Added %"PRIu64", maxid now %"PRIu64
" from %"PRIu64,
@ -944,7 +947,7 @@ static bool peer_sending_revocation(struct peer *peer,
bool peer_save_commitsig_received(struct peer *peer, u64 commitnum)
{
if (commitnum != peer->num_commits_received) {
log_broken(peer->log,
peer_internal_error(peer,
"channel_got_commitsig: expected commitnum %"PRIu64
" got %"PRIu64,
peer->num_commits_received, commitnum);
@ -961,7 +964,7 @@ bool peer_save_commitsig_received(struct peer *peer, u64 commitnum)
bool peer_save_commitsig_sent(struct peer *peer, u64 commitnum)
{
if (commitnum != peer->num_commits_sent) {
log_broken(peer->log,
peer_internal_error(peer,
"channel_sent_commitsig: expected commitnum %"PRIu64
" got %"PRIu64,
peer->num_commits_sent, commitnum);
@ -996,8 +999,9 @@ int peer_got_commitsig(struct peer *peer, const u8 *msg)
&fulfilled,
&failed,
&changed)) {
log_broken(peer->log, "bad fromwire_channel_got_commitsig %s",
tal_hex(peer, msg));
peer_internal_error(peer,
"bad fromwire_channel_got_commitsig %s",
tal_hex(peer, msg));
return -1;
}
@ -1026,8 +1030,8 @@ int peer_got_commitsig(struct peer *peer, const u8 *msg)
for (i = 0; i < tal_count(changed); i++) {
if (!changed_htlc(peer, &changed[i])) {
log_broken(peer->log,
"got_commitsig: update failed");
peer_internal_error(peer,
"got_commitsig: update failed");
return -1;
}
}
@ -1068,8 +1072,8 @@ int peer_got_revoke(struct peer *peer, const u8 *msg)
&revokenum, &per_commitment_secret,
&next_per_commitment_point,
&changed)) {
log_broken(peer->log, "bad fromwire_channel_got_revoke %s",
tal_hex(peer, msg));
peer_internal_error(peer, "bad fromwire_channel_got_revoke %s",
tal_hex(peer, msg));
return -1;
}
@ -1087,23 +1091,23 @@ int peer_got_revoke(struct peer *peer, const u8 *msg)
return -1;
} else {
if (!changed_htlc(peer, &changed[i])) {
log_broken(peer->log,
"got_revoke: update failed");
peer_internal_error(peer,
"got_revoke: update failed");
return -1;
}
}
}
if (revokenum >= (1ULL << 48)) {
log_broken(peer->log, "got_revoke: too many txs %"PRIu64,
revokenum);
peer_internal_error(peer, "got_revoke: too many txs %"PRIu64,
revokenum);
return -1;
}
if (revokenum != peer->num_revocations_received) {
log_broken(peer->log, "got_revoke: expected %"PRIu64
" got %"PRIu64,
peer->num_revocations_received, revokenum);
peer_internal_error(peer, "got_revoke: expected %"PRIu64
" got %"PRIu64,
peer->num_revocations_received, revokenum);
return -1;
}
@ -1115,10 +1119,12 @@ int peer_got_revoke(struct peer *peer, const u8 *msg)
if (!shachain_add_hash(&peer->their_shachain,
shachain_index(revokenum),
&per_commitment_secret)) {
peer_fail(peer, "Bad per_commitment_secret %s for %"PRIu64,
type_to_string(msg, struct sha256,
&per_commitment_secret),
revokenum);
char *err = tal_fmt(peer,
"Bad per_commitment_secret %s for %"PRIu64,
type_to_string(msg, struct sha256,
&per_commitment_secret),
revokenum);
peer_fail_permanent(peer, take((u8 *)err));
return -1;
}