tech-kern archive

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]

Re: Question about tcp ephemeral ports



Hi,

Attached is a patch that makes my small test program work. It
applies to 5.1 and 5.1.1 only. Porting it to current would be a bit
harder due to the port randomization, as described by Eric
previously.

This is just a proof of concept and I would be happy to have some
feedback about how to write it better and what the potential
issues are.

Olivier
From 61c4012c89cd088f8f6e3f16f5e1306104232b28 Mon Sep 17 00:00:00 2001
From: Olivier Matz <olivier.matz%6wind.com@localhost>
Date: Thu, 2 Feb 2012 16:51:05 +0100
Subject: tcp: allow to reuse an ephemeral port if dest addr/port is different

When a TCP client calls connect(), an implicit bind is done by the
network stack to choose an ephemeral port. Currently, there is a
limitation that prevents the TCP client from opening many ephemeral
ports even if the destination port or address is different.

The problem is described in detail here:
http://mail-index.netbsd.org/tech-kern/2012/01/30/msg012602.html

The goal of this patch is to duplicate the code of in_pcbbind() in
a new function in_pcbbind_before_connect() that is called specifically
by the TCP connect code when doing an implicit bind. The behaviour is a
bit different compared to the initial in_pcbbind():

- only the (nam == NULL) case is allowed
- the function is aware of remote address that will be given to the
  connect(). The duplication of the ephemeral port is checked by a
  in_pcblookup_connect() instead of a in_pcblookup_port().
- the socket state is not changed to BOUND (but the pcb is added to
  the INPCBHASH_PORT table). The connect() will change the state to
  CONNECTED if it is successful.

If in_pcbconnect() fails, we need to restore the initial state:
reset inp->inp_lport to 0, move the pcb back to the port-0 bucket of
the INPCBHASH_PORT table, and clear the INP_ANONPORT flag.

Note: this patch is just a proof of concept and should probably be
cleaned and enhanced. Currently, only IPv4 is done.
---
 netinet/in_pcb.c     |   88 ++++++++++++++++++++++++++++++++++++++++++++++++++
 netinet/in_pcb.h     |    2 +
 netinet/tcp_usrreq.c |   10 +++++-
 3 files changed, 99 insertions(+), 1 deletions(-)

diff --git a/netinet/in_pcb.c b/netinet/in_pcb.c
index 5d662ce..498a344 100644
--- a/netinet/in_pcb.c
+++ b/netinet/in_pcb.c
@@ -371,6 +371,94 @@ noname:
        return (0);
 }
 
+int
+in_pcbbind_before_connect(void *v, struct in_addr raddr,
+          u_int rport, struct lwp *l)
+{
+       struct inpcb *inp = v;
+       struct socket *so = inp->inp_socket;
+       struct inpcbtable *table = inp->inp_table;
+       struct sockaddr_in *sin = NULL; /* XXXGCC */
+       u_int16_t lport = 0;
+#ifndef IPNOPRIVPORTS
+       kauth_cred_t cred = l->l_cred;
+#endif
+       int        cnt;
+       u_int16_t  mymin, mymax;
+       u_int16_t *lastport;
+
+       if (inp->inp_af != AF_INET)
+               return (EINVAL);
+
+       if (TAILQ_FIRST(&in_ifaddrhead) == 0)
+               return (EADDRNOTAVAIL);
+       if (inp->inp_lport || !in_nullhost(inp->inp_laddr))
+               return (EINVAL);
+
+       if (inp->inp_flags & INP_LOWPORT) {
+#ifndef IPNOPRIVPORTS
+               if (kauth_authorize_network(cred,
+                                           KAUTH_NETWORK_BIND,
+                                           KAUTH_REQ_NETWORK_BIND_PRIVPORT, so,
+                                           sin, NULL))
+                       return (EACCES);
+#endif
+               mymin = lowportmin;
+               mymax = lowportmax;
+               lastport = &table->inpt_lastlow;
+       } else {
+               mymin = anonportmin;
+               mymax = anonportmax;
+               lastport = &table->inpt_lastport;
+       }
+       if (mymin > mymax) {    /* sanity check */
+               u_int16_t swp;
+
+               swp = mymin;
+               mymin = mymax;
+               mymax = swp;
+       }
+
+       lport = *lastport - 1;
+       for (cnt = mymax - mymin + 1; cnt; cnt--, lport--) {
+               if (lport < mymin || lport > mymax)
+                       lport = mymax;
+               if (!in_pcblookup_connect(table, inp->inp_laddr,
+                                         htons(lport), raddr, htons(rport)))
+                       goto found;
+       }
+       if (!in_nullhost(inp->inp_laddr))
+               inp->inp_laddr.s_addr = INADDR_ANY;
+       return (EAGAIN);
+
+ found:
+       inp->inp_flags |= INP_ANONPORT;
+       *lastport = lport;
+       lport = htons(lport);
+
+       inp->inp_lport = lport;
+       LIST_REMOVE(&inp->inp_head, inph_lhash);
+       LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), &inp->inp_head,
+                        inph_lhash);
+
+       return (0);
+}
+
+void
+in_pcbbind_revert(void *v)
+{
+       struct inpcb *inp = v;
+       struct inpcbtable *table = inp->inp_table;
+
+       /* Called from tcp_usrreq if the connect failed after an
+        * implicit bind. This will restore the initial state */
+       inp->inp_flags &= ~INP_ANONPORT;
+       inp->inp_lport = 0;
+       LIST_REMOVE(&inp->inp_head, inph_lhash);
+       LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), &inp->inp_head,
+                        inph_lhash);
+}
+
 /*
  * Connect from a socket to a specified address.
  * Both address and port must be specified in argument sin.
diff --git a/netinet/in_pcb.h b/netinet/in_pcb.h
index 8e1d929..51a0a5c 100644
--- a/netinet/in_pcb.h
+++ b/netinet/in_pcb.h
@@ -125,6 +125,8 @@ struct inpcb {
 void   in_losing(struct inpcb *);
 int    in_pcballoc(struct socket *, void *);
 int    in_pcbbind(void *, struct mbuf *, struct lwp *);
+int    in_pcbbind_before_connect(void *, struct in_addr, u_int, struct lwp *);
+void   in_pcbbind_revert(void *v);
 int    in_pcbconnect(void *, struct mbuf *, struct lwp *);
 void   in_pcbdetach(void *);
 void   in_pcbdisconnect(void *);
diff --git a/netinet/tcp_usrreq.c b/netinet/tcp_usrreq.c
index 46f44c0..b889cdc 100644
--- a/netinet/tcp_usrreq.c
+++ b/netinet/tcp_usrreq.c
@@ -389,11 +389,19 @@ tcp_usrreq(struct socket *so, int req,
 #ifdef INET
                if (inp) {
                        if (inp->inp_lport == 0) {
-                               error = in_pcbbind(inp, (struct mbuf *)0, l);
+                               struct sockaddr_in *sin =
+                                       (struct sockaddr_in *)nam;
+                               error = in_pcbbind_before_connect(inp,
+                                       sin->sin_addr, ntohs(sin->sin_port), l);
                                if (error)
                                        break;
                        }
                        error = in_pcbconnect(inp, nam, l);
+                       if (error != 0) {
+                               /* if connect fails, we need to revert
+                                * bind_before_connect's work */
+                               in_pcbbind_revert(inp);
+                       }
                }
 #endif
 #ifdef INET6
-- 
1.7.7.3



Home | Main Index | Thread Index | Old Index