Subject: kern/18770: in_multiaddrs should be hashed instead of chained on in_ifaddr
To: None <gnats-bugs@gnats.netbsd.org>
From: Nick <naamato@nexthop.com>
List: netbsd-bugs
Date: 10/22/2002 15:37:59
>Number:         18770
>Category:       kern
>Synopsis:       in_multiaddrs should be hashed instead of chained on in_ifaddr
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    kern-bug-people
>State:          open
>Class:          change-request
>Submitter-Id:   net
>Arrival-Date:   Tue Oct 22 12:39:00 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator:     Nick Amato
>Release:        current 20021022 <NetBSD-current source date>
>Organization:
NextHop Technologies
>Environment:
	all
	
System:  all

>Description:

The current IPv4 multicast group code has a few disadvantages:

1) The requirement of kludge code to stash outstanding multicast group
   joins when their parent in_ifaddr goes away.

2) Poor scalability: the groups are stored in simple linked list.

3) Inability to join multicast groups on interfaces without an IPv4 address.
   Given that ip_multicast_if() can select an interface using the
   lower 24 bits of the arg to setsockopt(IP_ADD_MEMBERSHIP) and
   friends, it makes sense to allow receipt of multicast datagrams on
   interfaces without an address.

A solution is to hash these addresses just like other IPv4 addresses,
in their own hash table.  The attached patch accomplishes this.  All
three problems above are fixed;  the kludge code goes away, the multicast
address lookups are typically faster, and groups may be joined on unnumbered
interfaces.

I have tested it and it seems to work fine, however, suggestions
are welcome.

>How-To-Repeat:
	ifconfig gif0 create tunnel x.x.x.x y.y.y.y up
	setsockopt(IP_ADD_MEMBERSHIP) -> in_addmulti() fails
>Fix:

Index: in.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/in.c,v
retrieving revision 1.81
diff -u -r1.81 in.c
--- in.c	2002/10/22 02:28:47	1.81
+++ in.c	2002/10/22 18:26:48
@@ -152,13 +152,6 @@
 int hostzeroisbroadcast = HOSTZEROBROADCAST;
 
 /*
- * This list is used to keep track of in_multi chains which belong to
- * deleted interface addresses.  We use in_ifaddr so that a chain head
- * won't be deallocated until all multicast address record are deleted.
- */
-static TAILQ_HEAD(, in_ifaddr) in_mk = TAILQ_HEAD_INITIALIZER(in_mk);
-
-/*
  * Return 1 if an internet address is for a ``local'' host
  * (one to which we have a connection).  If subnetsarelocal
  * is true, this includes other subnets of the local net.
@@ -400,7 +393,6 @@
 				ia->ia_broadaddr.sin_family = AF_INET;
 			}
 			ia->ia_ifp = ifp;
-			LIST_INIT(&ia->ia_multiaddrs);
 		}
 		break;
 
@@ -551,13 +543,6 @@
 	TAILQ_REMOVE(&in_ifaddr, ia, ia_list);
 	if (ia->ia_allhosts != NULL)
 		in_delmulti(ia->ia_allhosts);
-	if (LIST_FIRST(&ia->ia_multiaddrs) != NULL &&
-	    /*
-	     * If the interface is going away, don't bother to save
-	     * the multicast entries.
-	     */
-	    ifp->if_output != if_nulloutput)
-		in_savemkludge(ia);
 	IFAFREE(&ia->ia_ifa);
 	in_setmaxmtu();
 }
@@ -574,7 +559,6 @@
 			continue;
 		in_purgeaddr(ifa, ifp);
 	}
-	in_purgemkludge(ifp);
 }
 
 /*
@@ -854,11 +838,7 @@
 		flags |= RTF_HOST;
 	}
 	error = in_addprefix(ia, flags);
-	/*
-	 * recover multicast kludge entry, if there is.
-	 */
-	if (ifp->if_flags & IFF_MULTICAST)
-		in_restoremkludge(ia, ifp);
+
 	/*
 	 * If the interface supports multicast, join the "all hosts"
 	 * multicast group on that interface.
@@ -1033,91 +1013,6 @@
 }
 
 /*
- * Multicast address kludge:
- * If there were any multicast addresses attached to this interface address,
- * either move them to another address on this interface, or save them until
- * such time as this interface is reconfigured for IPv4.
- */
-void
-in_savemkludge(oia)
-	struct in_ifaddr *oia;
-{
-	struct in_ifaddr *ia;
-	struct in_multi *inm, *next;
-
-	IFP_TO_IA(oia->ia_ifp, ia);
-	if (ia) {	/* there is another address */
-		for (inm = LIST_FIRST(&oia->ia_multiaddrs); inm; inm = next){
-			next = LIST_NEXT(inm, inm_list);
-			LIST_REMOVE(inm, inm_list);
-			IFAFREE(&inm->inm_ia->ia_ifa);
-			IFAREF(&ia->ia_ifa);
-			inm->inm_ia = ia;
-			LIST_INSERT_HEAD(&ia->ia_multiaddrs, inm, inm_list);
-		}
-	} else {	/* last address on this if deleted, save */
-		TAILQ_INSERT_TAIL(&in_mk, oia, ia_list);
-		IFAREF(&oia->ia_ifa);
-	}
-}
-
-/*
- * Continuation of multicast address hack:
- * If there was a multicast group list previously saved for this interface,
- * then we re-attach it to the first address configured on the i/f.
- */
-void
-in_restoremkludge(ia, ifp)
-	struct in_ifaddr *ia;
-	struct ifnet *ifp;
-{
-	struct in_ifaddr *oia;
-
-	for (oia = TAILQ_FIRST(&in_mk); oia != NULL;
-	    oia = TAILQ_NEXT(oia, ia_list)) {
-		if (oia->ia_ifp == ifp) {
-			struct in_multi *inm, *next;
-
-			for (inm = LIST_FIRST(&oia->ia_multiaddrs);
-			    inm != NULL; inm = next) {
-				next = LIST_NEXT(inm, inm_list);
-				LIST_REMOVE(inm, inm_list);
-				IFAFREE(&inm->inm_ia->ia_ifa);
-				IFAREF(&ia->ia_ifa);
-				inm->inm_ia = ia;
-				LIST_INSERT_HEAD(&ia->ia_multiaddrs,
-				    inm, inm_list);
-			}
-	    		TAILQ_REMOVE(&in_mk, oia, ia_list);
-			IFAFREE(&oia->ia_ifa);
-			break;
-		}
-	}
-}
-
-void
-in_purgemkludge(ifp)
-	struct ifnet *ifp;
-{
-	struct in_ifaddr *oia;
-
-	for (oia = TAILQ_FIRST(&in_mk); oia != NULL;
-	    oia = TAILQ_NEXT(oia, ia_list)) {
-		if (oia->ia_ifp != ifp)
-			continue;
-
-		/*
-		 * Leaving from all multicast groups joined through
-		 * this interface is done via in_pcbpurgeif().
-		 */
-
-	    	TAILQ_REMOVE(&in_mk, oia, ia_list);
-		IFAFREE(&oia->ia_ifa);
-		break;
-	}
-}
-
-/*
  * Add an address to the list of IP multicast addresses for a given interface.
  */
 struct in_multi *
@@ -1127,7 +1022,6 @@
 {
 	struct in_multi *inm;
 	struct ifreq ifr;
-	struct in_ifaddr *ia;
 	int s = splsoftnet();
 
 	/*
@@ -1142,7 +1036,7 @@
 	} else {
 		/*
 		 * New address; allocate a new multicast record
-		 * and link it into the interface's multicast list.
+		 * and link it into the multicast hash table
 		 */
 		inm = (struct in_multi *)malloc(sizeof(*inm),
 		    M_IPMADDR, M_NOWAIT);
@@ -1153,15 +1047,10 @@
 		inm->inm_addr = *ap;
 		inm->inm_ifp = ifp;
 		inm->inm_refcount = 1;
-		IFP_TO_IA(ifp, ia);
-		if (ia == NULL) {
-			free(inm, M_IPMADDR);
-			splx(s);
-			return (NULL);
-		}
-		inm->inm_ia = ia;
-		IFAREF(&inm->inm_ia->ia_ifa);
-		LIST_INSERT_HEAD(&ia->ia_multiaddrs, inm, inm_list);
+
+		LIST_INSERT_HEAD(&IN_MADDR_HASH(inm->inm_addr.s_addr),
+		    inm, inm_hash);
+
 		/*
 		 * Ask the network driver to update its multicast reception
 		 * filter appropriately for the new address.
@@ -1171,7 +1060,7 @@
 		satosin(&ifr.ifr_addr)->sin_addr = *ap;
 		if ((ifp->if_ioctl == NULL) ||
 		    (*ifp->if_ioctl)(ifp, SIOCADDMULTI,(caddr_t)&ifr) != 0) {
-			LIST_REMOVE(inm, inm_list);
+			LIST_REMOVE(inm, inm_hash);
 			free(inm, M_IPMADDR);
 			splx(s);
 			return (NULL);
@@ -1204,8 +1093,7 @@
 		/*
 		 * Unlink from list.
 		 */
-		LIST_REMOVE(inm, inm_list);
-		IFAFREE(&inm->inm_ia->ia_ifa);
+		LIST_REMOVE(inm, inm_hash);
 		/*
 		 * Notify the network driver to update its multicast reception
 		 * filter.
Index: in_var.h
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/in_var.h,v
retrieving revision 1.44
diff -u -r1.44 in_var.h
--- in_var.h	2002/05/12 20:33:50	1.44
+++ in_var.h	2002/10/22 18:26:48
@@ -99,7 +99,6 @@
 	struct	sockaddr_in ia_dstaddr;	/* reserve space for broadcast addr */
 #define	ia_broadaddr	ia_dstaddr
 	struct	sockaddr_in ia_sockmask; /* reserve space for general netmask */
-	LIST_HEAD(, in_multi) ia_multiaddrs; /* list of multicast addresses */
 	struct	in_multi *ia_allhosts;	/* multicast address record for
 					   the allhosts multicast group */
 };
@@ -122,6 +121,9 @@
 #ifndef IN_IFADDR_HASH_SIZE
 #define IN_IFADDR_HASH_SIZE 509	/* 61, 127, 251, 509, 1021, 2039 are good */
 #endif
+#ifndef IN_MADDR_HASH_SIZE
+#define IN_MADDR_HASH_SIZE 509	/* 61, 127, 251, 509, 1021, 2039 are good */
+#endif
 
 /*
  * This is a bit unconventional, and wastes a little bit of space, but
@@ -130,15 +132,22 @@
  */
 
 #define	IN_IFADDR_HASH(x) in_ifaddrhashtbl[(u_long)(x) % IN_IFADDR_HASH_SIZE]
+#define	IN_MADDR_HASH(x)  in_maddrhashtbl[(u_long)(x) % IN_MADDR_HASH_SIZE]
 
-LIST_HEAD(in_ifaddrhashhead, in_ifaddr);	/* Type of the hash head */
-TAILQ_HEAD(in_ifaddrhead, in_ifaddr);		/* Type of the list head */
+/*
+ * types for multicast and interface address hashes and list heads
+ */
+LIST_HEAD(in_ifaddrhashhead, in_ifaddr);
+TAILQ_HEAD(in_ifaddrhead, in_ifaddr);
+LIST_HEAD(in_maddrhashhead, in_multi);
 
 extern	u_long in_ifaddrhash;			/* size of hash table - 1 */
-extern	int	in_ifaddrentries;		/* total number of addrs */
 extern  struct in_ifaddrhashhead *in_ifaddrhashtbl;	/* Hash table head */
 extern  struct in_ifaddrhead in_ifaddr;		/* List head (in ip_input) */
 
+extern	u_long in_maddrhash;			/* size of hash table - 1 */
+extern  struct in_maddrhashhead *in_maddrhashtbl;	/* Hash table head */
+
 extern	struct	ifqueue	ipintrq;		/* ip packet input queue */
 extern	const	int	inetctlerrmap[];
 
@@ -228,10 +237,9 @@
 struct in_multi {
 	struct	in_addr inm_addr;	/* IP multicast address */
 	struct	ifnet *inm_ifp;		/* back pointer to ifnet */
-	struct	in_ifaddr *inm_ia;	/* back pointer to in_ifaddr */
 	u_int	inm_refcount;		/* no. membership claims by sockets */
 	u_int	inm_timer;		/* IGMP membership report timer */
-	LIST_ENTRY(in_multi) inm_list;	/* list of multicast addresses */
+	LIST_ENTRY(in_multi) inm_hash;	/* list of multicast addresses */
 	u_int	inm_state;		/* state of membership */
 	struct	router_info *inm_rti;	/* router version info */
 };
@@ -242,7 +250,7 @@
  * all of the in_multi records.
  */
 struct in_multistep {
-	struct in_ifaddr *i_ia;
+	struct in_maddrhashhead *i_hh;
 	struct in_multi *i_inm;
 };
 
@@ -255,16 +263,11 @@
 	/* struct ifnet *ifp; */ \
 	/* struct in_multi *inm; */ \
 { \
-	struct in_ifaddr *ia; \
-\
-	IFP_TO_IA((ifp), ia); 			/* multicast */ \
-	if (ia == NULL) \
-		(inm) = NULL; \
-	else \
-		for ((inm) = LIST_FIRST(&ia->ia_multiaddrs); \
-		    (inm) != NULL && !in_hosteq((inm)->inm_addr, (addr)); \
-		     (inm) = LIST_NEXT((inm), inm_list)) \
-			 continue; \
+	LIST_FOREACH(inm, &IN_MADDR_HASH((addr).s_addr), inm_hash) { \
+		if (in_hosteq((inm)->inm_addr, (addr)) && \
+		    (inm)->inm_ifp == ifp) \
+			break; \
+	} \
 }
 
 /*
@@ -279,13 +282,13 @@
 	/* struct in_multi *inm; */ \
 { \
 	if (((inm) = (step).i_inm) != NULL) \
-		(step).i_inm = LIST_NEXT((inm), inm_list); \
+		(step).i_inm = LIST_NEXT((inm), inm_hash); \
 	else \
-		while ((step).i_ia != NULL) { \
-			(inm) = LIST_FIRST(&(step).i_ia->ia_multiaddrs); \
-			(step).i_ia = TAILQ_NEXT((step).i_ia, ia_list); \
+		while ((step).i_hh != in_maddrhashtbl + IN_MADDR_HASH_SIZE) { \
+			(inm) = LIST_FIRST((step).i_hh); \
+			(step).i_hh++; \
 			if ((inm) != NULL) { \
-				(step).i_inm = LIST_NEXT((inm), inm_list); \
+				(step).i_inm = LIST_NEXT((inm), inm_hash); \
 				break; \
 			} \
 		} \
@@ -295,7 +298,7 @@
 	/* struct in_multistep step; */ \
 	/* struct in_multi *inm; */ \
 { \
-	(step).i_ia = TAILQ_FIRST(&in_ifaddr); \
+	(step).i_hh = &in_maddrhashtbl[0]; \
 	(step).i_inm = NULL; \
 	IN_NEXT_MULTI((step), (inm)); \
 }
@@ -304,9 +307,6 @@
 
 int	in_ifinit __P((struct ifnet *,
 	    struct in_ifaddr *, struct sockaddr_in *, int));
-void	in_savemkludge __P((struct in_ifaddr *));
-void	in_restoremkludge __P((struct in_ifaddr *, struct ifnet *));
-void	in_purgemkludge __P((struct ifnet *));
 struct	in_multi *in_addmulti __P((struct in_addr *, struct ifnet *));
 void	in_delmulti __P((struct in_multi *));
 void	in_ifscrub __P((struct ifnet *, struct in_ifaddr *));
Index: ip_input.c
===================================================================
RCS file: /cvsroot/syssrc/sys/netinet/ip_input.c,v
retrieving revision 1.158
diff -u -r1.158 ip_input.c
--- ip_input.c	2002/09/23 13:43:27	1.158
+++ ip_input.c	2002/10/22 18:26:48
@@ -202,9 +202,10 @@
 extern	struct domain inetdomain;
 int	ipqmaxlen = IFQ_MAXLEN;
 u_long	in_ifaddrhash;				/* size of hash table - 1 */
-int	in_ifaddrentries;			/* total number of addrs */
+u_long	in_maddrhash;				/* size of hash table - 1 */
 struct	in_ifaddrhead in_ifaddr;
 struct	in_ifaddrhashhead *in_ifaddrhashtbl;
+struct	in_maddrhashhead *in_maddrhashtbl;
 struct	ifqueue ipintrq;
 struct	ipstat	ipstat;
 u_int16_t	ip_id;
@@ -338,6 +339,8 @@
 	TAILQ_INIT(&in_ifaddr);
 	in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IFADDR,
 	    M_WAITOK, &in_ifaddrhash);
+	in_maddrhashtbl = hashinit(IN_MADDR_HASH_SIZE, HASH_LIST, M_IPMADDR,
+	    M_WAITOK, &in_maddrhash);
 	if (ip_mtudisc != 0)
 		ip_mtudisc_timeout_q =
 		    rt_timer_queue_create(ip_mtudisc_timeout);
>Release-Note:
>Audit-Trail:
>Unformatted: