Subject: load sharing
To: None <tech-net@netbsd.org>
From: Mihai Chelaru <kefren@ngnetworks.ro>
List: tech-net
Date: 11/15/2007 10:55:05
--Boundary-00=_plAPHj0PYRgIqff
Content-Type: text/plain;
  charset="us-ascii"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline

Hi,

Attached is a patch that allows load sharing over multiple gateways. Right
now it supports a few simple algorithms, selectable via the
net.inet.ip.load-sharing sysctl. Although I've only written the IP part,
implementing it for other protocols is very simple (currently I have nothing
other than IP here to test with).

Also, right now it supports adding cloning routes with the same destination on
different interfaces, meaning you can link two cards to the _same_ Ethernet
domain. ARP resolution in this case works round-robin, but this is subject
to change: future work will allow two different cards connected to
different Ethernet domains to use the same IP subnet.

Testers and opinions are highly appreciated :)

-- 
Mihai

P.S. Please CC me

--Boundary-00=_plAPHj0PYRgIqff
Content-Type: text/x-diff;
  charset="us-ascii";
  name="loadsharing-noarp.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
	filename="loadsharing-noarp.patch"

Index: sys/net/route.c
===================================================================
RCS file: /cvsroot/src/sys/net/route.c,v
retrieving revision 1.98
diff -u -p -r1.98 route.c
--- sys/net/route.c	10 Oct 2007 22:14:38 -0000	1.98
+++ sys/net/route.c	15 Nov 2007 08:40:23 -0000
@@ -147,6 +147,7 @@ struct callout rt_timer_ch; /* callout f
 static int _rtcache_debug = 0;
 #endif /* RTFLUSH_DEBUG */
 
+struct rtentry *rtgethead(const struct sockaddr *, const struct sockaddr *);
 static int rtdeletemsg(struct rtentry *);
 static int rtflushclone1(struct rtentry *, void *);
 static void rtflushclone(sa_family_t family, struct rtentry *);
@@ -304,6 +305,34 @@ rtalloc(struct route *ro)
 	rtcache(ro);
 }
 
+/*
+ * Round-robin path selection: advance rt->rt_last with rtnext(), return it.
+ * rt should be the first path; its rt_last field is the round-robin cursor.
+ */
+struct rtentry *
+rtchoosepath_rr(struct rtentry *rt)
+{
+	rt->rt_last = rtnext(rt->rt_last);	/* may stay put if nothing else is up */
+	return rt->rt_last;
+}
+
+/*
+ * Return the first entry after rt on its rt_list ring that has RTF_UP set.
+ * The walk can come back to rt itself; if nothing is up, rt is returned.
+ */
+struct rtentry *
+rtnext(struct rtentry *rt)
+{
+	struct rtentry *retrt, *sentinel;	/* sentinel: CLIST_FOREACH scratch */
+
+	KASSERT(rt != NULL);
+	CLIST_FOREACH(retrt, CLIST_NEXT(rt, rt_list), sentinel, rt_list)
+	    if (retrt->rt_flags & RTF_UP)
+		return retrt;
+
+	return rt;	/* no entry is up: hand back the one we were given */
+}
+
 struct rtentry *
 rtalloc1(const struct sockaddr *dst, int report)
 {
@@ -355,28 +384,81 @@ rtalloc1(const struct sockaddr *dst, int
 	return newrt;
 }
 
+/*
+ * Thin rnh_lookup wrapper for the dst->sa_family table.  Returns the entry,
+ * or NULL (bumping rts_unreach) on no table, no match, or an RNF_ROOT node.
+ */
+struct rtentry *
+rtgethead(const struct sockaddr *dst, const struct sockaddr *netmask)
+{
+	struct radix_node_head *rnh = rt_tables[dst->sa_family];
+	struct rtentry *rt = NULL;
+	struct radix_node *rn;
+	int  s = splsoftnet();	/* undone by splx(s) before returning */
+
+	if (rnh && (rn = rnh->rnh_lookup(dst, netmask, rnh)) &&
+	    ((rn->rn_flags & RNF_ROOT) == 0))
+		rt = (struct rtentry *)rn;	/* presumably the first path of its list - confirm */
+		else
+		rtstat.rts_unreach++;
+
+	splx(s);
+	return rt;
+}
+
 void
 rtfree(struct rtentry *rt)
 {
-	struct ifaddr *ifa;
+	struct rtentry *rthead;
 
 	if (rt == NULL)
 		panic("rtfree");
 	rt->rt_refcnt--;
 	if (rt->rt_refcnt <= 0 && (rt->rt_flags & RTF_UP) == 0) {
-		if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
-			panic ("rtfree 2");
 		rttrash--;
 		if (rt->rt_refcnt < 0) {
 			printf("rtfree: %p not freed (neg refs)\n", rt);
 			return;
 		}
+		rthead = RTFIRST(rt);
+		rthead->rt_total--;
+		if (rthead->rt_total == 0 &&
+		    (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)))
+			panic("rtfree 2");
 		rt_timer_remove_all(rt, 0);
-		ifa = rt->rt_ifa;
-		rt->rt_ifa = NULL;
-		IFAFREE(ifa);
-		rt->rt_ifp = NULL;
-		rt_destroy(rt);
+		IFAFREE(rt->rt_ifa);
+		if (rthead->rt_total == 0) {
+			/* No other paths */
+			rt_destroy(rt);
+		} else if (rthead == rt) {
+			/* First GW to delete from more */
+			struct radix_node_head *rnh;
+			struct rtentry *srt = CLIST_NEXT(rthead, rt_list),
+				*sen, *rtin;
+			KASSERT(rt != srt);
+			srt->rt_total = rt->rt_total;
+			srt->rt_last = srt;
+			CLIST_REMOVE(rt, rt_list);
+			if ((rnh = rt_tables[rt_getkey(rt)->sa_family]) == NULL)
+				panic("rtfree: rt_tables");
+			if (rnh->rnh_deladdr(rt_getkey(rt), rt_mask(rt), rnh) == NULL)
+				panic("rtfree: deladdr");
+			if (rnh->rnh_addaddr(rt_getkey(srt), rt_mask(srt), rnh,
+			    srt->rt_nodes) == NULL)
+				panic("rtfree: addaddr");
+			CLIST_FOREACH(rtin, srt, sen, rt_list)
+				RTFIRST(rtin) = srt;
+		} else {
+			/* Delete a non-first path */
+			CLIST_REMOVE(rt, rt_list);
+			if (rthead->rt_last == rt)
+				rthead->rt_last = rthead;
+		}
+
+		if (rt->rt_gateway != NULL)
+			sockaddr_free(rt->rt_gateway);
+		/* do I really need this ? I also Bzero at pool_get */
+		Bzero(rt, sizeof(*rt));
 		pool_put(&rtentry_pool, rt);
 	}
 }
@@ -427,20 +509,33 @@ rtredirect(const struct sockaddr *dst, c
 		error = ENETUNREACH;
 		goto out;
 	}
-	rt = rtalloc1(dst, 0);
 	/*
-	 * If the redirect isn't from our current router for this dst,
-	 * it's either old or wrong.  If it redirects us to ourselves,
-	 * we have a routing loop, perhaps as a result of an interface
-	 * going down recently.
+	 * If it redirects us to ourselves we have a routing loop,
+	 * perhaps as a result of an interface going down recently.
 	 */
-	if (!(flags & RTF_DONE) && rt &&
-	     (!equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
-		error = EINVAL;
-	else if (ifa_ifwithaddr(gateway))
+	if (ifa_ifwithaddr(gateway)) {
 		error = EHOSTUNREACH;
-	if (error)
-		goto done;
+		goto out;
+	}
+	rt = rtalloc1(dst, 0);
+	if (rt && !(flags & RTF_DONE)) {
+		/*
+	 	 * If the redirect isn't from our current router for this dst,
+	 	 * it's either old or wrong. Also calibrate rt.
+	 	 */
+		struct rtentry *sentinel, *nrt;
+		CLIST_FOREACH(nrt, rt, sentinel, rt_list)
+		    if(equal(src, nrt->rt_gateway) && (nrt->rt_ifa == ifa))
+			break;
+		if(nrt == NULL) {
+			error = EINVAL;
+			goto done;
+		}
+		rt->rt_refcnt--;
+		nrt->rt_refcnt++;
+		rt = nrt;
+	}
+
 	/*
 	 * Create a new entry if we just got back a wildcard entry
 	 * or the lookup failed.  This is necessary for hosts
@@ -485,6 +580,7 @@ rtredirect(const struct sockaddr *dst, c
 		}
 	} else
 		error = EHOSTUNREACH;
+
 done:
 	if (rt) {
 		if (rtp != NULL && !error)
@@ -674,7 +770,7 @@ rtrequest1(int req, struct rt_addrinfo *
 {
 	int s = splsoftnet();
 	int error = 0;
-	struct rtentry *rt, *crt;
+	struct rtentry *rt, *crt = NULL, *sentinel, *nrt;
 	struct radix_node *rn;
 	struct radix_node_head *rnh;
 	struct ifaddr *ifa;
@@ -698,16 +794,45 @@ rtrequest1(int req, struct rt_addrinfo *
 		}
 		if ((rn = rnh->rnh_lookup(dst, netmask, rnh)) == NULL)
 			senderr(ESRCH);
-		rt = (struct rtentry *)rn;
+		crt = rt = (struct rtentry *)rn;
+		/* Calibrate */
+		if (gateway != NULL && !(crt->rt_flags & RTF_CLONING)) {
+			/*
+			 * XXX: we can have a gateway on cloning route
+			 */
+			CLIST_FOREACH(rt, crt, sentinel, rt_list)
+			    if (sockaddr_cmp(gateway, rt->rt_gateway) == 0)
+				break;
+			if (rt == NULL)
+				senderr(ESRCH);
+		} else
+			if (! CLIST_SINGULAR(crt, rt_list)) {
+				/*
+				 * If gateway is not provided when
+				 * multiple paths exist check if it's a cloning
+				 * route and try to match ifp
+				 */
+				if ( (crt->rt_flags & RTF_CLONING) == 0 ||
+				    !(info->rti_ifa)) 
+				    senderr(EINVAL);
+				CLIST_FOREACH(rt, crt, sentinel, rt_list)
+				    if (rt->rt_ifp == info->rti_ifa->ifa_ifp)
+					break;
+				if (rt == NULL)
+				    senderr(EINVAL);
+			}
+		if (CLIST_SINGULAR(rt, rt_list)) {
+			if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL)
+				senderr(ESRCH);
+			if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
+				panic("rtrequest delete");
+		}
 		if ((rt->rt_flags & RTF_CLONING) != 0) {
 			/* clean up any cloned children */
 			rtflushclone(dst->sa_family, rt);
 		}
-		if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL)
-			senderr(ESRCH);
-		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
-			panic ("rtrequest delete");
-		rt = (struct rtentry *)rn;
+		if (rt->rt_nodes->rn_flags & RNF_ROOT)
+			panic("rtrequest delete 2");
 		if (rt->rt_gwroute) {
 			RTFREE(rt->rt_gwroute);
 			rt->rt_gwroute = NULL;
@@ -733,6 +858,13 @@ rtrequest1(int req, struct rt_addrinfo *
 			senderr(EINVAL);
 		if ((rt->rt_flags & RTF_CLONING) == 0)
 			senderr(EINVAL);
+		/*
+		 * If there is more than one cloning route,
+		 * use them in round-robin order
+		 * XXX: this will change
+		 */
+		if (!CLIST_SINGULAR(rt, rt_list))
+			rt = rtchoosepath_rr(rt);
 		ifa = rt->rt_ifa;
 		flags = rt->rt_flags & ~(RTF_CLONING | RTF_STATIC);
 		flags |= RTF_CLONED;
@@ -781,26 +913,60 @@ rtrequest1(int req, struct rt_addrinfo *
 			rt->rt_parent = *ret_nrt;
 			rt->rt_parent->rt_refcnt++;
 		}
+		rt->rt_total = 1;
+		rt->rt_first = rt;
+		rt->rt_last = rt;
+		CLIST_INIT(rt, rt_list);
 		RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
 		    __LINE__, (void *)rt->_rt_key);
 		rn = rnh->rnh_addaddr(rt_getkey(rt), netmask, rnh,
 		    rt->rt_nodes);
 		RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
 		    __LINE__, (void *)rt->_rt_key);
-		if (rn == NULL && (crt = rtalloc1(rt_getkey(rt), 0)) != NULL) {
+		if (rn == NULL &&
+		    ((crt = rtgethead(rt_getkey(rt), NULL)) != NULL) &&
 			/* overwrite cloned route */
-			if ((crt->rt_flags & RTF_CLONED) != 0) {
-				rtdeletemsg(crt);
-				rn = rnh->rnh_addaddr(rt_getkey(rt),
-				    netmask, rnh, rt->rt_nodes);
+		    ((crt->rt_flags & RTF_CLONED) != 0)) {
+			rtdeletemsg(crt);
+			rn = rnh->rnh_addaddr(rt_getkey(rt),
+			    netmask, rnh, rt->rt_nodes);
+			crt = NULL;
+			if (rn == NULL) {
+				error = ENOMEM;
+				goto eexist;
 			}
-			RTFREE(crt);
 			RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
-			    __LINE__, (void *)rt->_rt_key);
+		    	    __LINE__, (void *)rt->_rt_key);
 		}
-		RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
-		    __LINE__, (void *)rt->_rt_key);
-		if (rn == NULL) {
+		else if (req == RTM_ADD && rn == NULL &&
+		    ((crt = rtgethead(rt_getkey(rt), netmask)) != NULL)) {
+			/* New route for the same destination */
+			if (crt->rt_total >= MAX_PATHS) {
+				error = E2BIG;
+				goto eexist;
+			}
+			if (gateway) {
+			    CLIST_FOREACH(nrt, crt, sentinel, rt_list)
+				if (sockaddr_cmp(nrt->rt_gateway, gateway) == 0)
+				    goto eexist;
+			} else if((rt->rt_flags & RTF_CLONING) &&
+				    (info->rti_ifa)) {
+			    CLIST_FOREACH(nrt, crt, sentinel, rt_list)
+				if (nrt->rt_ifp == info->rti_ifa->ifa_ifp)
+				    goto eexist;
+			} else if(rt->rt_flags & RTF_CLONING)
+			    CLIST_FOREACH(nrt, crt, sentinel, rt_list)
+				if (nrt->rt_ifp == rt->rt_ifp)
+				    goto eexist;
+			sockaddr_free(rt->_rt_key);
+			rt->rt_nodes->rn_mask = crt->rt_nodes->rn_mask;
+			rt->_rt_key = crt->_rt_key;
+			rt->rt_first = crt;
+			CLIST_INSERT_AFTER(crt, rt, rt_list);
+			crt->rt_total++;
+			crt = NULL;
+		} else if (rn == NULL) {
+eexist:
 			IFAFREE(ifa);
 			if ((rt->rt_flags & RTF_CLONED) != 0 && rt->rt_parent)
 				rtfree(rt->rt_parent);
@@ -808,7 +974,10 @@ rtrequest1(int req, struct rt_addrinfo *
 				rtfree(rt->rt_gwroute);
 			rt_destroy(rt);
 			pool_put(&rtentry_pool, rt);
-			senderr(EEXIST);
+			if (error)
+				senderr(error)
+			else
+				senderr(EEXIST);
 		}
 		RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
 		    __LINE__, (void *)rt->_rt_key);
@@ -824,7 +993,8 @@ rtrequest1(int req, struct rt_addrinfo *
 			/* clean up any cloned children */
 			rtflushclone(dst->sa_family, rt);
 		}
-		rtflushall(dst->sa_family);
+		if (crt == NULL)
+			rtflushall(dst->sa_family);
 		break;
 	case RTM_GET:
 		if (netmask != NULL) {
@@ -837,6 +1007,7 @@ rtrequest1(int req, struct rt_addrinfo *
 			senderr(ESRCH);
 		if (ret_nrt != NULL) {
 			rt = (struct rtentry *)rn;
+			rt = rtchoosepath_rr(rt);
 			*ret_nrt = rt;
 			rt->rt_refcnt++;
 		}
@@ -944,8 +1115,12 @@ rtinit(struct ifaddr *ifa, int cmd, int 
 			rt_maskedcopy(odst, dst, ifa->ifa_netmask);
 		}
 		if ((rt = rtalloc1(dst, 0)) != NULL) {
+			struct rtentry *sentinel;
 			rt->rt_refcnt--;
-			if (rt->rt_ifa != ifa)
+			CLIST_FOREACH(rt, rt, sentinel, rt_list)
+				if (rt->rt_ifa->ifa_ifp == ifa->ifa_ifp)
+					break;
+			if (rt == NULL)
 				return (flags & RTF_HOST) ? EHOSTUNREACH
 							: ENETUNREACH;
 		}
Index: sys/net/route.h
===================================================================
RCS file: /cvsroot/src/sys/net/route.h,v
retrieving revision 1.58
diff -u -p -r1.58 route.h
--- sys/net/route.h	27 Aug 2007 00:34:01 -0000	1.58
+++ sys/net/route.h	15 Nov 2007 08:40:23 -0000
@@ -93,6 +93,10 @@ struct rt_metrics {
 #ifndef RNF_NORMAL
 #include <net/radix.h>
 #endif
+
+/* XXX: sysctl maybe ? */
+#define MAX_PATHS 64
+
 struct rtentry {
 	struct	radix_node rt_nodes[2];	/* tree glue, and other values */
 #define	rt_mask(r)	((const struct sockaddr *)((r)->rt_nodes->rn_mask))
@@ -108,7 +112,13 @@ struct rtentry {
 	struct	rtentry *rt_gwroute;	/* implied entry for gatewayed routes */
 	LIST_HEAD(, rttimer) rt_timer;  /* queue of timeouts for misc funcs */
 	struct	rtentry *rt_parent;	/* parent of cloned route */
-	struct sockaddr *_rt_key;
+	struct	sockaddr *_rt_key;
+	/* load-sharing */
+	CLIST_ENTRY(rtentry) rt_list;
+	struct	rtentry *rt_first;	/* First entry in list */
+#define	RTFIRST(r)	((r)->rt_first)
+	struct	rtentry *rt_last;	/* For round robin */
+	uint8_t	rt_total;		/* Number of paths */
 };
 
 static inline const struct sockaddr *
@@ -366,6 +376,7 @@ out:
 }
 
 struct rtentry *rtfindparent(struct radix_node_head *, struct route *);
+struct rtentry *rtnext(struct rtentry *);
 
 #ifdef RTCACHE_DEBUG
 #define	rtcache_init(ro)		rtcache_init_debug(__func__, ro)
@@ -386,6 +397,7 @@ void	rtcache_clear(struct route *);
 void	rtcache_update(struct route *, int);
 void	rtcache_free(struct route *);
 int	rtcache_setdst(struct route *, const struct sockaddr *);
+struct rtentry* rtchoosepath_rr(struct rtentry *);
 
 static inline struct rtentry *
 rtcache_lookup1(struct route *ro, const struct sockaddr *dst, int clone)
Index: sys/net/rtsock.c
===================================================================
RCS file: /cvsroot/src/sys/net/rtsock.c,v
retrieving revision 1.95
diff -u -p -r1.95 rtsock.c
--- sys/net/rtsock.c	19 Jul 2007 20:48:53 -0000	1.95
+++ sys/net/rtsock.c	15 Nov 2007 08:40:23 -0000
@@ -306,7 +306,7 @@ route_output(struct mbuf *m, ...)
 		if (rtm->rtm_type != RTM_GET) {/* XXX: too grotty */
 			struct radix_node *rn;
 
-			if (memcmp(dst, rt_getkey(rt), dst->sa_len) != 0)
+			if (sockaddr_cmp(dst, rt_getkey(rt)) != 0)
 				senderr(ESRCH);
 			netmask = intern_netmask(netmask);
 			for (rn = rt->rt_nodes; rn; rn = rn->rn_dupedkey)
@@ -923,6 +923,8 @@ sysctl_dumpentry(struct rtentry *rt, voi
 	int error = 0, size;
 	struct rt_addrinfo info;
 
+	if (CLIST_NEXT(rt, rt_list) != RTFIRST(rt))
+		sysctl_dumpentry(CLIST_NEXT(rt, rt_list), v);
 	if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
 		return 0;
 	memset(&info, 0, sizeof(info));
Index: sys/netinet/in.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/in.c,v
retrieving revision 1.118
diff -u -p -r1.118 in.c
--- sys/netinet/in.c	1 Sep 2007 04:32:51 -0000	1.118
+++ sys/netinet/in.c	15 Nov 2007 08:40:23 -0000
@@ -987,7 +987,7 @@ bad:
 
 /*
  * add a route to prefix ("connected route" in cisco terminology).
- * does nothing if there's some interface address with the same prefix already.
+ * does nothing if the same prefix is already assigned to the same interface.
  */
 static int
 in_addprefix(struct in_ifaddr *target, int flags)
@@ -1012,14 +1012,11 @@ in_addprefix(struct in_ifaddr *target, i
 			p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
 		}
 
-		if (prefix.s_addr != p.s_addr)
+		if (prefix.s_addr != p.s_addr || target->ia_ifp != ia->ia_ifp)
 			continue;
-
 		/*
-		 * if we got a matching prefix route inserted by other
-		 * interface address, we don't need to bother
-		 *
-		 * XXX RADIX_MPATH implications here? -dyoung
+		 * if we got a matching prefix route inserted on the same
+		 * interface, we don't need to bother
 		 */
 		if (ia->ia_flags & IFA_ROUTE)
 			return 0;
Index: sys/netinet/in.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/in.h,v
retrieving revision 1.81
diff -u -p -r1.81 in.h
--- sys/netinet/in.h	19 Sep 2007 04:33:43 -0000	1.81
+++ sys/netinet/in.h	15 Nov 2007 08:40:23 -0000
@@ -450,8 +450,9 @@ struct ip_mreq {
 #define	IPCTL_IFQ	       21	/* ipintrq node */
 #define	IPCTL_RANDOMID	       22	/* use random IP ids (if configured) */
 #define	IPCTL_LOOPBACKCKSUM    23	/* do IP checksum on loopback */
-#define	IPCTL_STATS		24	/* IP statistics */
-#define	IPCTL_MAXID	       25
+#define	IPCTL_STATS	       24	/* IP statistics */
+#define IPCTL_LOAD_SHARING     25	/* Load sharing */
+#define	IPCTL_MAXID	       26
 
 #define	IPCTL_NAMES { \
 	{ 0, 0 }, \
@@ -479,7 +480,13 @@ struct ip_mreq {
 	{ "random_id", CTLTYPE_INT }, \
 	{ "do_loopback_cksum", CTLTYPE_INT }, \
 	{ "stats", CTLTYPE_STRUCT }, \
+	{ "load-sharing", CTLTYPE_NODE }, \
 }
+
+/* Load sharing */
+#define	IPCTL_LS_SELECTED	1
+#define	IPCTL_LS_AVAILABLE	2
+
 #endif /* _NETBSD_SOURCE */
 
 /* INET6 stuff */
Index: sys/netinet/ip_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_input.c,v
retrieving revision 1.254
diff -u -p -r1.254 ip_input.c
--- sys/netinet/ip_input.c	2 Oct 2007 20:35:04 -0000	1.254
+++ sys/netinet/ip_input.c	15 Nov 2007 08:40:23 -0000
@@ -218,6 +218,13 @@ int	ip_do_randomid = 0;
  */
 int	ip_checkinterface = 0;
 
+#define INITIAL_LS 2
+#define MAX_LS_STRING 20
+
+/* See also defines in ip_output.c if you want to change these */
+const char* load_sharing_strings[] = { "first-only", "round-robin",
+		"simple-sum", NULL };
+int	load_sharing_index = INITIAL_LS;
 
 struct rttimer_queue *ip_mtudisc_timeout_q = NULL;
 
@@ -2163,6 +2170,45 @@ sysctl_net_inet_ip_hashsize(SYSCTLFN_ARG
 }
 #endif /* GATEWAY */
 
+static int
+sysctl_load_sharing(SYSCTLFN_ARGS)	/* handler for the "selected" node */
+{
+	int error, i;
+	struct sysctlnode node = *rnode;	/* local copy so sysctl_data can be redirected */
+	char lsc[MAX_LS_STRING];	/* current name; compared as the request below */
+
+	strlcpy(lsc, load_sharing_strings[load_sharing_index], MAX_LS_STRING);
+	node.sysctl_data = lsc;
+	error = sysctl_lookup(SYSCTLFN_CALL(&node));
+	if (error || newp == NULL)	/* presumably: failure, or no new value supplied */
+		return error;
+	for (i=0; load_sharing_strings[i] != NULL; i++)
+		if (strncmp(load_sharing_strings[i], lsc, MAX_LS_STRING) == 0)
+			break;
+
+	if (load_sharing_strings[i] == NULL)	/* hit the terminator: unknown name */
+		return EINVAL;
+	load_sharing_index = i;	/* index into load_sharing_strings[] */
+	return 0;
+}
+
+static int
+sysctl_ls_types(SYSCTLFN_ARGS)	/* handler for the "available" node */
+{
+	struct sysctlnode node = *rnode;
+	int i;
+	char rt[255];	/* same 255 as the size given to sysctl_createv */
+
+	rt[0]=0;
+	/* XXX: slow and ugly; joins every name in the table with single spaces */
+	for (i=0; load_sharing_strings[i] != NULL; i++) {
+		strlcat(rt, load_sharing_strings[i], 255);
+		if (load_sharing_strings[i+1] != NULL)	/* no trailing separator */
+			strlcat(rt, " ", 255);
+		}
+	node.sysctl_data = rt;
+	return sysctl_lookup(SYSCTLFN_CALL(&node));
+}
 
 SYSCTL_SETUP(sysctl_net_inet_ip_setup, "sysctl net.inet.ip subtree setup")
 {
@@ -2370,4 +2416,24 @@ SYSCTL_SETUP(sysctl_net_inet_ip_setup, "
 		       NULL, 0, &ipstat, sizeof(ipstat),
 		       CTL_NET, PF_INET, IPPROTO_IP, IPCTL_STATS,
 		       CTL_EOL);
+	sysctl_createv(clog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT, CTLTYPE_NODE, "load-sharing",
+		       SYSCTL_DESCR("IP load sharing"),
+		       NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IP,
+		       IPCTL_LOAD_SHARING, CTL_EOL);
+	sysctl_createv(clog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
+		       CTLTYPE_STRING, "selected",
+		       SYSCTL_DESCR("IP load sharing algorithm"),
+		       sysctl_load_sharing, 0,
+		       &load_sharing_strings[INITIAL_LS],
+		       MAX_LS_STRING - 1,
+		       CTL_NET, PF_INET, IPPROTO_IP,
+		       IPCTL_LOAD_SHARING, IPCTL_LS_SELECTED, CTL_EOL);
+	sysctl_createv(clog, 0, NULL, NULL,
+		       CTLFLAG_PERMANENT, CTLTYPE_STRING, "available",
+		       SYSCTL_DESCR("IP load sharing supported algorithms"),
+		       sysctl_ls_types, 0, NULL, 255, CTL_NET,
+		       PF_INET, IPPROTO_IP, IPCTL_LOAD_SHARING, IPCTL_LS_AVAILABLE,
+		       CTL_EOL);
 }
Index: sys/netinet/ip_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_output.c,v
retrieving revision 1.184
diff -u -p -r1.184 ip_output.c
--- sys/netinet/ip_output.c	19 Sep 2007 04:33:43 -0000	1.184
+++ sys/netinet/ip_output.c	15 Nov 2007 08:40:23 -0000
@@ -171,6 +171,16 @@ int	ip_do_loopback_cksum = 0;
 	(((csum_flags) & M_CSUM_TCPv4) != 0 && tcp_do_loopback_cksum) || \
 	(((csum_flags) & M_CSUM_IPv4) != 0 && ip_do_loopback_cksum)))
 
+/* See also string associations in ip_input.c if you want to change these */
+#define LS_NONE 0
+#define LS_RR   1
+#define LS_SS   2
+
+extern int load_sharing_index;
+
+#define tiny_sum(ip4a) ((ip4a >> 24) + (ip4a << 8 >> 24) + \
+			(ip4a << 16 >> 24) + (ip4a << 24 >> 24))
+
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
@@ -338,13 +348,43 @@ ip_output(struct mbuf *m0, ...)
 		mtu = ifp->if_mtu;
 		IFP_TO_IA(ifp, ia);
 	} else {
-		if (ro->ro_rt == NULL)
+		int ro_cached = 1;
+		if (ro->ro_rt == NULL) {
 			rtcache_init(ro);
+			ro_cached = 0;
+		}
 		if (ro->ro_rt == NULL) {
 			ipstat.ips_noroute++;
 			error = EHOSTUNREACH;
 			goto bad;
 		}
+		/* Load-sharing */
+		if (ro->ro_rt->rt_total > 1 &&
+		    load_sharing_index != LS_NONE &&
+		    !(load_sharing_index == LS_SS && ro_cached)) {
+			ro->ro_rt->rt_refcnt--;
+			switch(load_sharing_index) {
+			    case LS_RR:
+				ro->ro_rt = rtchoosepath_rr(ro->ro_rt);
+				break;
+			    case LS_SS:
+				{
+				uint8_t i, hsh;
+				/* I'm not that happy with this "sum" */
+				hsh = ( tiny_sum(ip->ip_src.s_addr) +
+					tiny_sum(ip->ip_dst.s_addr) +
+					ip->ip_p + ip->ip_tos) %
+						ro->ro_rt->rt_total;
+				/* XXX: Normally it should be up... */
+				if (hsh == 0 && !(ro->ro_rt->rt_flags & RTF_UP))
+					ro->ro_rt = rtnext(ro->ro_rt);
+				else for (i = 0; i < hsh; i++)
+					ro->ro_rt = rtnext(ro->ro_rt);
+				}
+				break;
+			}
+			ro->ro_rt->rt_refcnt++;
+		}
 		ia = ifatoia(ro->ro_rt->rt_ifa);
 		ifp = ro->ro_rt->rt_ifp;
 		if ((mtu = ro->ro_rt->rt_rmx.rmx_mtu) == 0)
Index: sys/sys/queue.h
===================================================================
RCS file: /cvsroot/src/sys/sys/queue.h,v
retrieving revision 1.47
diff -u -p -r1.47 queue.h
--- sys/sys/queue.h	18 Jul 2007 12:07:35 -0000	1.47
+++ sys/sys/queue.h	15 Nov 2007 08:40:23 -0000
@@ -674,4 +674,57 @@ struct {								\
 	    ? ((head)->cqh_last)					\
 	    : (elm->field.cqe_prev))
 
+/* Circular (ring) list links.  There is no separate head structure: each
+ * element embeds a CLIST_ENTRY, and a lone element points at itself in both
+ * directions (see CLIST_INIT and CLIST_SINGULAR). */
+#define	CLIST_ENTRY(__type)		\
+	struct {			\
+		struct __type *cl_next;	\
+		struct __type *cl_prev;	\
+	}
+
+/* Circular list operations.  CLIST_FOREACH* visit each element once from
+ * __first (saved in __sentinel); __elm ends up NULL after a complete pass.
+ * CLIST_REMOVE leaves the removed element self-linked, as CLIST_INIT does. */
+#define	CLIST_FOREACH1(__elm, __first, __sentinel, __field)	\
+	for ((__elm) = (__sentinel) = (__first); (__elm) != NULL;\
+	     (__elm) = ((__elm)->__field == (__sentinel))	\
+	        ?  NULL						\
+		: (__elm)->__field)
+
+#define	CLIST_FOREACH(__elm, __first, __sentinel, __field)		\
+	CLIST_FOREACH1((__elm), (__first), __sentinel, __field.cl_next)
+
+#define	CLIST_FOREACH_REVERSE(__elm, __first, __sentinel, __field)	\
+	CLIST_FOREACH1((__elm), (__first), __sentinel, __field.cl_prev)
+
+#define	CLIST_INIT(__elm, __field)					\
+	do {								\
+		(__elm)->__field.cl_prev = (__elm)->__field.cl_next =	\
+		    (__elm);		\
+	} while (/*CONSTCOND*/0)
+
+#define	CLIST_SINGULAR(__elm, __field)	((__elm)->__field.cl_prev == (__elm))
+
+#define	CLIST_REMOVE(__elm, __field)				\
+	do {							\
+		(__elm)->__field.cl_prev->__field.cl_next =	\
+		    (__elm)->__field.cl_next;			\
+		(__elm)->__field.cl_next->__field.cl_prev =	\
+		    (__elm)->__field.cl_prev;			\
+		CLIST_INIT((__elm), __field);			\
+	} while (/*CONSTCOND*/0)
+
+#define	CLIST_INSERT_AFTER(__listelm, __elm, __field)			\
+	do {								\
+		assert(__listelm != __elm); /* NOTE(review): assert() must be in scope */ \
+		(__elm)->__field.cl_prev = (__listelm);			\
+		(__elm)->__field.cl_next = (__listelm)->__field.cl_next;\
+		(__listelm)->__field.cl_next = (__elm);			\
+		(__elm)->__field.cl_next->__field.cl_prev = (__elm);	\
+	} while (/*CONSTCOND*/0)
+
+#define	CLIST_NEXT(__elm, __field)	((__elm)->__field.cl_next)	/* never NULL on an initialized ring */
+#define	CLIST_PREV(__elm, __field)	((__elm)->__field.cl_prev)
+
+
 #endif	/* !_SYS_QUEUE_H_ */
Index: usr.bin/netstat/route.c
===================================================================
RCS file: /cvsroot/src/usr.bin/netstat/route.c,v
retrieving revision 1.69
diff -u -p -r1.69 route.c
--- usr.bin/netstat/route.c	19 Jul 2007 20:51:04 -0000	1.69
+++ usr.bin/netstat/route.c	15 Nov 2007 08:40:23 -0000
@@ -171,6 +171,11 @@ again:
 		} else if (do_rtent) {
 			kget(rn, rtentry);
 			p_krtentry(&rtentry);
+			while ( CLIST_NEXT(&rtentry, rt_list) !=
+			    (struct rtentry*)rn ) {
+				kget(CLIST_NEXT(&rtentry, rt_list), rtentry);
+				p_krtentry(&rtentry);
+			}
 			if (Aflag)
 				p_rtnode();
 		} else {

--Boundary-00=_plAPHj0PYRgIqff--