mirror of
https://github.com/ElementsProject/lightning.git
synced 2025-02-22 22:45:27 +01:00
connectd: don't start connecting in parallel in peer_conn_closed.
The crash below from @zerofeerouting left me confused. The invalid
value in fmt_wireaddr_internal is a telltale sign of use-after-free.
This backtrace shows us destroying the conn *twice*: what's happening?
Well, tal carefully protects against destroying twice: it's not that
unusual to free something in a destructor which has already been freed.
So this indicates that there are *two* io_conn hanging off one
struct connecting, which isn't supposed to happen! We deliberately
call try_connect_one_addr() initially, then inside the io_conn destructor.
But due to races in connectd vs lightningd connection state, we added
a fix which allows a connect command to sit around while the peer is
cleaning up (6cc9f37cab
) and get fired
off when it's done.
But what if, in the chaos, we are already connecting again? Now we'll
end up with *two* connections.
Fortunately, we have a `conn` pointer inside struct connecting, which
(with a bit of additional care) we can ensure is only non-NULL while
we're actually trying to connect. This lets us check that before
firing off a new connection attempt in peer_conn_closed.
```
lightning_connectd: FATAL SIGNAL 6 (version v0.11.2rc2-2-g8f7e939)
0x5614a4915ae8 send_backtrace
common/daemon.c:33
0x5614a4915b72 crashdump
common/daemon.c:46
0x7ffa14fcd72f ???
???:0
0x7ffa14dc87bb ???
???:0
0x7ffa14db3534 ???
???:0
0x5614a491fc71 fmt_wireaddr_internal
common/wireaddr.c:255
0x5614a491fc7a fmt_wireaddr_internal_
common/wireaddr.c:257
0x5614a491ea6b type_to_string_
common/type_to_string.c:32
0x5614a490beaa destroy_io_conn
connectd/connectd.c:754
0x5614a494a2f1 destroy_conn
ccan/ccan/io/poll.c:246
0x5614a494a313 destroy_conn_close_fd
ccan/ccan/io/poll.c:252
0x5614a4953804 notify
ccan/ccan/tal/tal.c:240
0x5614a49538d6 del_tree
ccan/ccan/tal/tal.c:402
0x5614a4953928 del_tree
ccan/ccan/tal/tal.c:412
0x5614a4953e07 tal_free
ccan/ccan/tal/tal.c:486
0x5614a4908b7a try_connect_one_addr
connectd/connectd.c:870
0x5614a490bef1 destroy_io_conn
connectd/connectd.c:759
0x5614a494a2f1 destroy_conn
ccan/ccan/io/poll.c:246
0x5614a494a313 destroy_conn_close_fd
ccan/ccan/io/poll.c:252
0x5614a4953804 notify
ccan/ccan/tal/tal.c:240
0x5614a49538d6 del_tree
ccan/ccan/tal/tal.c:402
0x5614a4953e07 tal_free
ccan/ccan/tal/tal.c:486
0x5614a4948f08 io_close
ccan/ccan/io/io.c:450
0x5614a4948f59 do_plan
ccan/ccan/io/io.c:401
0x5614a4948fe1 io_ready
ccan/ccan/io/io.c:417
0x5614a494a8e6 io_loop
ccan/ccan/io/poll.c:453
0x5614a490c12f main
connectd/connectd.c:2164
0x7ffa14db509a ???
???:0
0x5614a4904e99 ???
???:0
0xffffffffffffffff ???
???:0
```
Fixes: #5339
Changelog-Fixed: connectd: occasional crash when we reconnect to a peer quickly.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
parent
8e2dcc1167
commit
4ee55acc71
1 changed files with 4 additions and 3 deletions
|
@ -748,6 +748,7 @@ static void destroy_io_conn(struct io_conn *conn, struct connecting *connect)
|
|||
&connect->addrs[connect->addrnum]),
|
||||
connect->connstate, errstr));
|
||||
connect->addrnum++;
|
||||
connect->conn = NULL;
|
||||
try_connect_one_addr(connect);
|
||||
}
|
||||
|
||||
|
@ -849,8 +850,7 @@ static void try_connect_one_addr(struct connecting *connect)
|
|||
struct sockaddr_in6 *sa6;
|
||||
#endif
|
||||
|
||||
/* In case we fail without a connection, make destroy_io_conn happy */
|
||||
connect->conn = NULL;
|
||||
assert(!connect->conn);
|
||||
|
||||
/* Out of addresses? */
|
||||
if (connect->addrnum == tal_count(connect->addrs)) {
|
||||
|
@ -1892,6 +1892,7 @@ static void try_connect_peer(struct daemon *daemon,
|
|||
connect->seconds_waited = seconds_waited;
|
||||
connect->addrhint = tal_steal(connect, addrhint);
|
||||
connect->errors = tal_strdup(connect, "");
|
||||
connect->conn = NULL;
|
||||
list_add_tail(&daemon->connecting, &connect->list);
|
||||
tal_add_destructor(connect, destroy_connecting);
|
||||
|
||||
|
@ -1944,7 +1945,7 @@ void peer_conn_closed(struct peer *peer)
|
|||
tal_free(peer);
|
||||
|
||||
/* If we wanted to connect to it, but found it was exiting, try again */
|
||||
if (connect)
|
||||
if (connect && !connect->conn)
|
||||
try_connect_one_addr(connect);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue