Subject: load sharing
To: None <tech-net@netbsd.org>
From: Mihai Chelaru <kefren@ngnetworks.ro>
List: tech-net
Date: 11/15/2007 10:55:05
--Boundary-00=_plAPHj0PYRgIqff
Content-Type: text/plain;
charset="us-ascii"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline
Hi,
Attached is a patch that allows load sharing over multiple gateways. Right
now it supports a few simple ways of doing it, selectable via the
net.inet.ip.load-sharing sysctl. Although I've only written the IP part,
implementing it for other protocols is very simple (currently I have nothing
other than IP here to test with).
Also, right now it supports adding cloning routes with the same destination on
different interfaces, meaning you can link two cards to the _same_ Ethernet
domain. ARP resolution in this case works round-robin, but this is subject
to future changes that will allow two different cards to be connected to
different Ethernet domains while using the same IP subnet.
Testers and opinions are highly appreciated :)
--
Mihai
P.S. Please CC me
--Boundary-00=_plAPHj0PYRgIqff
Content-Type: text/x-diff;
charset="us-ascii";
name="loadsharing-noarp.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
filename="loadsharing-noarp.patch"
Index: sys/net/route.c
===================================================================
RCS file: /cvsroot/src/sys/net/route.c,v
retrieving revision 1.98
diff -u -p -r1.98 route.c
--- sys/net/route.c 10 Oct 2007 22:14:38 -0000 1.98
+++ sys/net/route.c 15 Nov 2007 08:40:23 -0000
@@ -147,6 +147,7 @@ struct callout rt_timer_ch; /* callout f
static int _rtcache_debug = 0;
#endif /* RTFLUSH_DEBUG */
+struct rtentry *rtgethead(const struct sockaddr *, const struct sockaddr *);
static int rtdeletemsg(struct rtentry *);
static int rtflushclone1(struct rtentry *, void *);
static void rtflushclone(sa_family_t family, struct rtentry *);
@@ -304,6 +305,34 @@ rtalloc(struct route *ro)
rtcache(ro);
}
+/*
+ * Returns rtentry in a RR fashion
+ * rt should be the first path
+ */
+struct rtentry *
+rtchoosepath_rr(struct rtentry *rt)
+{
+ rt->rt_last = rtnext(rt->rt_last);
+ return rt->rt_last;
+}
+
+/*
+ * Next rtentry that is UP (in case there is such a thing).
+ * If none is found, return the rtentry that was passed in.
+ */
+struct rtentry *
+rtnext(struct rtentry *rt)
+{
+ struct rtentry *retrt, *sentinel;
+
+ KASSERT(rt != NULL);
+ CLIST_FOREACH(retrt, CLIST_NEXT(rt, rt_list), sentinel, rt_list)
+ if (retrt->rt_flags & RTF_UP)
+ return retrt;
+
+ return rt;
+}
+
struct rtentry *
rtalloc1(const struct sockaddr *dst, int report)
{
@@ -355,28 +384,81 @@ rtalloc1(const struct sockaddr *dst, int
return newrt;
}
+/*
+ * returns head of the list
+ * just a rnh_lookup wrapper
+ */
+struct rtentry *
+rtgethead(const struct sockaddr *dst, const struct sockaddr *netmask)
+{
+ struct radix_node_head *rnh = rt_tables[dst->sa_family];
+ struct rtentry *rt = NULL;
+ struct radix_node *rn;
+ int s = splsoftnet();
+
+ if (rnh && (rn = rnh->rnh_lookup(dst, netmask, rnh)) &&
+ ((rn->rn_flags & RNF_ROOT) == 0))
+ rt = (struct rtentry *)rn;
+ else
+ rtstat.rts_unreach++;
+
+ splx(s);
+ return rt;
+}
+
void
rtfree(struct rtentry *rt)
{
- struct ifaddr *ifa;
+ struct rtentry *rthead;
if (rt == NULL)
panic("rtfree");
rt->rt_refcnt--;
if (rt->rt_refcnt <= 0 && (rt->rt_flags & RTF_UP) == 0) {
- if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
- panic ("rtfree 2");
rttrash--;
if (rt->rt_refcnt < 0) {
printf("rtfree: %p not freed (neg refs)\n", rt);
return;
}
+ rthead = RTFIRST(rt);
+ rthead->rt_total--;
+ if (rthead->rt_total == 0 &&
+ (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT)))
+ panic("rtfree 2");
rt_timer_remove_all(rt, 0);
- ifa = rt->rt_ifa;
- rt->rt_ifa = NULL;
- IFAFREE(ifa);
- rt->rt_ifp = NULL;
- rt_destroy(rt);
+ IFAFREE(rt->rt_ifa);
+ if (rthead->rt_total == 0) {
+ /* No other paths */
+ rt_destroy(rt);
+ } else if (rthead == rt) {
+ /* Deleting the first of several gateways */
+ struct radix_node_head *rnh;
+ struct rtentry *srt = CLIST_NEXT(rthead, rt_list),
+ *sen, *rtin;
+ KASSERT(rt != srt);
+ srt->rt_total = rt->rt_total;
+ srt->rt_last = srt;
+ CLIST_REMOVE(rt, rt_list);
+ if ((rnh = rt_tables[rt_getkey(rt)->sa_family]) == NULL)
+ panic("rtfree: rt_tables");
+ if (rnh->rnh_deladdr(rt_getkey(rt), rt_mask(rt), rnh) == NULL)
+ panic("rtfree: deladdr");
+ if (rnh->rnh_addaddr(rt_getkey(srt), rt_mask(srt), rnh,
+ srt->rt_nodes) == NULL)
+ panic("rtfree: addaddr");
+ CLIST_FOREACH(rtin, srt, sen, rt_list)
+ RTFIRST(rtin) = srt;
+ } else {
+ /* Delete a non-first path */
+ CLIST_REMOVE(rt, rt_list);
+ if (rthead->rt_last == rt)
+ rthead->rt_last = rthead;
+ }
+
+ if (rt->rt_gateway != NULL)
+ sockaddr_free(rt->rt_gateway);
+ /* do I really need this ? I also Bzero at pool_get */
+ Bzero(rt, sizeof(*rt));
pool_put(&rtentry_pool, rt);
}
}
@@ -427,20 +509,33 @@ rtredirect(const struct sockaddr *dst, c
error = ENETUNREACH;
goto out;
}
- rt = rtalloc1(dst, 0);
/*
- * If the redirect isn't from our current router for this dst,
- * it's either old or wrong. If it redirects us to ourselves,
- * we have a routing loop, perhaps as a result of an interface
- * going down recently.
+ * If it redirects us to ourselves we have a routing loop,
+ * perhaps as a result of an interface going down recently.
*/
- if (!(flags & RTF_DONE) && rt &&
- (!equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
- error = EINVAL;
- else if (ifa_ifwithaddr(gateway))
+ if (ifa_ifwithaddr(gateway)) {
error = EHOSTUNREACH;
- if (error)
- goto done;
+ goto out;
+ }
+ rt = rtalloc1(dst, 0);
+ if (rt && !(flags & RTF_DONE)) {
+ /*
+ * If the redirect isn't from our current router for this dst,
+ * it's either old or wrong. Also calibrate rt.
+ */
+ struct rtentry *sentinel, *nrt;
+ CLIST_FOREACH(nrt, rt, sentinel, rt_list)
+ if(equal(src, nrt->rt_gateway) && (nrt->rt_ifa == ifa))
+ break;
+ if(nrt == NULL) {
+ error = EINVAL;
+ goto done;
+ }
+ rt->rt_refcnt--;
+ nrt->rt_refcnt++;
+ rt = nrt;
+ }
+
/*
* Create a new entry if we just got back a wildcard entry
* or the lookup failed. This is necessary for hosts
@@ -485,6 +580,7 @@ rtredirect(const struct sockaddr *dst, c
}
} else
error = EHOSTUNREACH;
+
done:
if (rt) {
if (rtp != NULL && !error)
@@ -674,7 +770,7 @@ rtrequest1(int req, struct rt_addrinfo *
{
int s = splsoftnet();
int error = 0;
- struct rtentry *rt, *crt;
+ struct rtentry *rt, *crt = NULL, *sentinel, *nrt;
struct radix_node *rn;
struct radix_node_head *rnh;
struct ifaddr *ifa;
@@ -698,16 +794,45 @@ rtrequest1(int req, struct rt_addrinfo *
}
if ((rn = rnh->rnh_lookup(dst, netmask, rnh)) == NULL)
senderr(ESRCH);
- rt = (struct rtentry *)rn;
+ crt = rt = (struct rtentry *)rn;
+ /* Calibrate */
+ if (gateway != NULL && !(crt->rt_flags & RTF_CLONING)) {
+ /*
+ * XXX: we can have a gateway on cloning route
+ */
+ CLIST_FOREACH(rt, crt, sentinel, rt_list)
+ if (sockaddr_cmp(gateway, rt->rt_gateway) == 0)
+ break;
+ if (rt == NULL)
+ senderr(ESRCH);
+ } else
+ if (! CLIST_SINGULAR(crt, rt_list)) {
+ /*
+ * If gateway is not provided when
+ * multiple paths exist check if it's a cloning
+ * route and try to match ifp
+ */
+ if ( (crt->rt_flags & RTF_CLONING) == 0 ||
+ !(info->rti_ifa))
+ senderr(EINVAL);
+ CLIST_FOREACH(rt, crt, sentinel, rt_list)
+ if (rt->rt_ifp == info->rti_ifa->ifa_ifp)
+ break;
+ if (rt == NULL)
+ senderr(EINVAL);
+ }
+ if (CLIST_SINGULAR(rt, rt_list)) {
+ if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL)
+ senderr(ESRCH);
+ if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
+ panic("rtrequest delete");
+ }
if ((rt->rt_flags & RTF_CLONING) != 0) {
/* clean up any cloned children */
rtflushclone(dst->sa_family, rt);
}
- if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL)
- senderr(ESRCH);
- if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
- panic ("rtrequest delete");
- rt = (struct rtentry *)rn;
+ if (rt->rt_nodes->rn_flags & RNF_ROOT)
+ panic("rtrequest delete 2");
if (rt->rt_gwroute) {
RTFREE(rt->rt_gwroute);
rt->rt_gwroute = NULL;
@@ -733,6 +858,13 @@ rtrequest1(int req, struct rt_addrinfo *
senderr(EINVAL);
if ((rt->rt_flags & RTF_CLONING) == 0)
senderr(EINVAL);
+ /*
+ * See if we have more than one cloning route
+ * and use them in round-robin order
+ * XXX: this will change
+ */
+ if (!CLIST_SINGULAR(rt, rt_list))
+ rt = rtchoosepath_rr(rt);
ifa = rt->rt_ifa;
flags = rt->rt_flags & ~(RTF_CLONING | RTF_STATIC);
flags |= RTF_CLONED;
@@ -781,26 +913,60 @@ rtrequest1(int req, struct rt_addrinfo *
rt->rt_parent = *ret_nrt;
rt->rt_parent->rt_refcnt++;
}
+ rt->rt_total = 1;
+ rt->rt_first = rt;
+ rt->rt_last = rt;
+ CLIST_INIT(rt, rt_list);
RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
__LINE__, (void *)rt->_rt_key);
rn = rnh->rnh_addaddr(rt_getkey(rt), netmask, rnh,
rt->rt_nodes);
RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
__LINE__, (void *)rt->_rt_key);
- if (rn == NULL && (crt = rtalloc1(rt_getkey(rt), 0)) != NULL) {
+ if (rn == NULL &&
+ ((crt = rtgethead(rt_getkey(rt), NULL)) != NULL) &&
/* overwrite cloned route */
- if ((crt->rt_flags & RTF_CLONED) != 0) {
- rtdeletemsg(crt);
- rn = rnh->rnh_addaddr(rt_getkey(rt),
- netmask, rnh, rt->rt_nodes);
+ ((crt->rt_flags & RTF_CLONED) != 0)) {
+ rtdeletemsg(crt);
+ rn = rnh->rnh_addaddr(rt_getkey(rt),
+ netmask, rnh, rt->rt_nodes);
+ crt = NULL;
+ if (rn == NULL) {
+ error = ENOMEM;
+ goto eexist;
}
- RTFREE(crt);
RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
- __LINE__, (void *)rt->_rt_key);
+ __LINE__, (void *)rt->_rt_key);
}
- RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
- __LINE__, (void *)rt->_rt_key);
- if (rn == NULL) {
+ else if (req == RTM_ADD && rn == NULL &&
+ ((crt = rtgethead(rt_getkey(rt), netmask)) != NULL)) {
+ /* New route for the same destination */
+ if (crt->rt_total >= MAX_PATHS) {
+ error = E2BIG;
+ goto eexist;
+ }
+ if (gateway) {
+ CLIST_FOREACH(nrt, crt, sentinel, rt_list)
+ if (sockaddr_cmp(nrt->rt_gateway, gateway) == 0)
+ goto eexist;
+ } else if((rt->rt_flags & RTF_CLONING) &&
+ (info->rti_ifa)) {
+ CLIST_FOREACH(nrt, crt, sentinel, rt_list)
+ if (nrt->rt_ifp == info->rti_ifa->ifa_ifp)
+ goto eexist;
+ } else if(rt->rt_flags & RTF_CLONING)
+ CLIST_FOREACH(nrt, crt, sentinel, rt_list)
+ if (nrt->rt_ifp == rt->rt_ifp)
+ goto eexist;
+ sockaddr_free(rt->_rt_key);
+ rt->rt_nodes->rn_mask = crt->rt_nodes->rn_mask;
+ rt->_rt_key = crt->_rt_key;
+ rt->rt_first = crt;
+ CLIST_INSERT_AFTER(crt, rt, rt_list);
+ crt->rt_total++;
+ crt = NULL;
+ } else if (rn == NULL) {
+eexist:
IFAFREE(ifa);
if ((rt->rt_flags & RTF_CLONED) != 0 && rt->rt_parent)
rtfree(rt->rt_parent);
@@ -808,7 +974,10 @@ rtrequest1(int req, struct rt_addrinfo *
rtfree(rt->rt_gwroute);
rt_destroy(rt);
pool_put(&rtentry_pool, rt);
- senderr(EEXIST);
+ if (error)
+ senderr(error)
+ else
+ senderr(EEXIST);
}
RT_DPRINTF("%s l.%d: rt->_rt_key = %p\n", __func__,
__LINE__, (void *)rt->_rt_key);
@@ -824,7 +993,8 @@ rtrequest1(int req, struct rt_addrinfo *
/* clean up any cloned children */
rtflushclone(dst->sa_family, rt);
}
- rtflushall(dst->sa_family);
+ if (crt == NULL)
+ rtflushall(dst->sa_family);
break;
case RTM_GET:
if (netmask != NULL) {
@@ -837,6 +1007,7 @@ rtrequest1(int req, struct rt_addrinfo *
senderr(ESRCH);
if (ret_nrt != NULL) {
rt = (struct rtentry *)rn;
+ rt = rtchoosepath_rr(rt);
*ret_nrt = rt;
rt->rt_refcnt++;
}
@@ -944,8 +1115,12 @@ rtinit(struct ifaddr *ifa, int cmd, int
rt_maskedcopy(odst, dst, ifa->ifa_netmask);
}
if ((rt = rtalloc1(dst, 0)) != NULL) {
+ struct rtentry *sentinel;
rt->rt_refcnt--;
- if (rt->rt_ifa != ifa)
+ CLIST_FOREACH(rt, rt, sentinel, rt_list)
+ if (rt->rt_ifa->ifa_ifp == ifa->ifa_ifp)
+ break;
+ if (rt == NULL)
return (flags & RTF_HOST) ? EHOSTUNREACH
: ENETUNREACH;
}
Index: sys/net/route.h
===================================================================
RCS file: /cvsroot/src/sys/net/route.h,v
retrieving revision 1.58
diff -u -p -r1.58 route.h
--- sys/net/route.h 27 Aug 2007 00:34:01 -0000 1.58
+++ sys/net/route.h 15 Nov 2007 08:40:23 -0000
@@ -93,6 +93,10 @@ struct rt_metrics {
#ifndef RNF_NORMAL
#include <net/radix.h>
#endif
+
+/* XXX: sysctl maybe ? */
+#define MAX_PATHS 64
+
struct rtentry {
struct radix_node rt_nodes[2]; /* tree glue, and other values */
#define rt_mask(r) ((const struct sockaddr *)((r)->rt_nodes->rn_mask))
@@ -108,7 +112,13 @@ struct rtentry {
struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */
LIST_HEAD(, rttimer) rt_timer; /* queue of timeouts for misc funcs */
struct rtentry *rt_parent; /* parent of cloned route */
- struct sockaddr *_rt_key;
+ struct sockaddr *_rt_key;
+ /* load-sharing */
+ CLIST_ENTRY(rtentry) rt_list;
+ struct rtentry *rt_first; /* First entry in list */
+#define RTFIRST(r) ((r)->rt_first)
+ struct rtentry *rt_last; /* For round robin */
+ uint8_t rt_total; /* Number of paths */
};
static inline const struct sockaddr *
@@ -366,6 +376,7 @@ out:
}
struct rtentry *rtfindparent(struct radix_node_head *, struct route *);
+struct rtentry *rtnext(struct rtentry *);
#ifdef RTCACHE_DEBUG
#define rtcache_init(ro) rtcache_init_debug(__func__, ro)
@@ -386,6 +397,7 @@ void rtcache_clear(struct route *);
void rtcache_update(struct route *, int);
void rtcache_free(struct route *);
int rtcache_setdst(struct route *, const struct sockaddr *);
+struct rtentry* rtchoosepath_rr(struct rtentry *);
static inline struct rtentry *
rtcache_lookup1(struct route *ro, const struct sockaddr *dst, int clone)
Index: sys/net/rtsock.c
===================================================================
RCS file: /cvsroot/src/sys/net/rtsock.c,v
retrieving revision 1.95
diff -u -p -r1.95 rtsock.c
--- sys/net/rtsock.c 19 Jul 2007 20:48:53 -0000 1.95
+++ sys/net/rtsock.c 15 Nov 2007 08:40:23 -0000
@@ -306,7 +306,7 @@ route_output(struct mbuf *m, ...)
if (rtm->rtm_type != RTM_GET) {/* XXX: too grotty */
struct radix_node *rn;
- if (memcmp(dst, rt_getkey(rt), dst->sa_len) != 0)
+ if (sockaddr_cmp(dst, rt_getkey(rt)) != 0)
senderr(ESRCH);
netmask = intern_netmask(netmask);
for (rn = rt->rt_nodes; rn; rn = rn->rn_dupedkey)
@@ -923,6 +923,8 @@ sysctl_dumpentry(struct rtentry *rt, voi
int error = 0, size;
struct rt_addrinfo info;
+ if (CLIST_NEXT(rt, rt_list) != RTFIRST(rt))
+ sysctl_dumpentry(CLIST_NEXT(rt, rt_list), v);
if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
return 0;
memset(&info, 0, sizeof(info));
Index: sys/netinet/in.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/in.c,v
retrieving revision 1.118
diff -u -p -r1.118 in.c
--- sys/netinet/in.c 1 Sep 2007 04:32:51 -0000 1.118
+++ sys/netinet/in.c 15 Nov 2007 08:40:23 -0000
@@ -987,7 +987,7 @@ bad:
/*
* add a route to prefix ("connected route" in cisco terminology).
- * does nothing if there's some interface address with the same prefix already.
+ * does nothing if the same prefix is already assigned to the same interface.
*/
static int
in_addprefix(struct in_ifaddr *target, int flags)
@@ -1012,14 +1012,11 @@ in_addprefix(struct in_ifaddr *target, i
p.s_addr &= ia->ia_sockmask.sin_addr.s_addr;
}
- if (prefix.s_addr != p.s_addr)
+ if (prefix.s_addr != p.s_addr || target->ia_ifp != ia->ia_ifp)
continue;
-
/*
- * if we got a matching prefix route inserted by other
- * interface address, we don't need to bother
- *
- * XXX RADIX_MPATH implications here? -dyoung
+ * if we got a matching prefix route inserted on the same
+ * interface, we don't need to bother
*/
if (ia->ia_flags & IFA_ROUTE)
return 0;
Index: sys/netinet/in.h
===================================================================
RCS file: /cvsroot/src/sys/netinet/in.h,v
retrieving revision 1.81
diff -u -p -r1.81 in.h
--- sys/netinet/in.h 19 Sep 2007 04:33:43 -0000 1.81
+++ sys/netinet/in.h 15 Nov 2007 08:40:23 -0000
@@ -450,8 +450,9 @@ struct ip_mreq {
#define IPCTL_IFQ 21 /* ipintrq node */
#define IPCTL_RANDOMID 22 /* use random IP ids (if configured) */
#define IPCTL_LOOPBACKCKSUM 23 /* do IP checksum on loopback */
-#define IPCTL_STATS 24 /* IP statistics */
-#define IPCTL_MAXID 25
+#define IPCTL_STATS 24 /* IP statistics */
+#define IPCTL_LOAD_SHARING 25 /* Load sharing */
+#define IPCTL_MAXID 26
#define IPCTL_NAMES { \
{ 0, 0 }, \
@@ -479,7 +480,13 @@ struct ip_mreq {
{ "random_id", CTLTYPE_INT }, \
{ "do_loopback_cksum", CTLTYPE_INT }, \
{ "stats", CTLTYPE_STRUCT }, \
+ { "load-sharing", CTLTYPE_NODE }, \
}
+
+/* Load sharing */
+#define IPCTL_LS_SELECTED 1
+#define IPCTL_LS_AVAILABLE 2
+
#endif /* _NETBSD_SOURCE */
/* INET6 stuff */
Index: sys/netinet/ip_input.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_input.c,v
retrieving revision 1.254
diff -u -p -r1.254 ip_input.c
--- sys/netinet/ip_input.c 2 Oct 2007 20:35:04 -0000 1.254
+++ sys/netinet/ip_input.c 15 Nov 2007 08:40:23 -0000
@@ -218,6 +218,13 @@ int ip_do_randomid = 0;
*/
int ip_checkinterface = 0;
+#define INITIAL_LS 2
+#define MAX_LS_STRING 20
+
+/* See also defines in ip_output.c if you want to change these */
+const char* load_sharing_strings[] = { "first-only", "round-robin",
+ "simple-sum", NULL };
+int load_sharing_index = INITIAL_LS;
struct rttimer_queue *ip_mtudisc_timeout_q = NULL;
@@ -2163,6 +2170,45 @@ sysctl_net_inet_ip_hashsize(SYSCTLFN_ARG
}
#endif /* GATEWAY */
+static int
+sysctl_load_sharing(SYSCTLFN_ARGS)
+{
+ int error, i;
+ struct sysctlnode node = *rnode;
+ char lsc[MAX_LS_STRING];
+
+ strlcpy(lsc, load_sharing_strings[load_sharing_index], MAX_LS_STRING);
+ node.sysctl_data = lsc;
+ error = sysctl_lookup(SYSCTLFN_CALL(&node));
+ if (error || newp == NULL)
+ return error;
+ for (i=0; load_sharing_strings[i] != NULL; i++)
+ if (strncmp(load_sharing_strings[i], lsc, MAX_LS_STRING) == 0)
+ break;
+
+ if (load_sharing_strings[i] == NULL)
+ return EINVAL;
+ load_sharing_index = i;
+ return 0;
+}
+
+static int
+sysctl_ls_types(SYSCTLFN_ARGS)
+{
+ struct sysctlnode node = *rnode;
+ int i;
+ char rt[255];
+
+ rt[0]=0;
+ /* XXX: slow and ugly */
+ for (i=0; load_sharing_strings[i] != NULL; i++) {
+ strlcat(rt, load_sharing_strings[i], 255);
+ if (load_sharing_strings[i+1] != NULL)
+ strlcat(rt, " ", 255);
+ }
+ node.sysctl_data = rt;
+ return sysctl_lookup(SYSCTLFN_CALL(&node));
+}
SYSCTL_SETUP(sysctl_net_inet_ip_setup, "sysctl net.inet.ip subtree setup")
{
@@ -2370,4 +2416,24 @@ SYSCTL_SETUP(sysctl_net_inet_ip_setup, "
NULL, 0, &ipstat, sizeof(ipstat),
CTL_NET, PF_INET, IPPROTO_IP, IPCTL_STATS,
CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT, CTLTYPE_NODE, "load-sharing",
+ SYSCTL_DESCR("IP load sharing"),
+ NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IP,
+ IPCTL_LOAD_SHARING, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
+ CTLTYPE_STRING, "selected",
+ SYSCTL_DESCR("IP load sharing algorithm"),
+ sysctl_load_sharing, 0,
+ &load_sharing_strings[INITIAL_LS],
+ MAX_LS_STRING - 1,
+ CTL_NET, PF_INET, IPPROTO_IP,
+ IPCTL_LOAD_SHARING, IPCTL_LS_SELECTED, CTL_EOL);
+ sysctl_createv(clog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT, CTLTYPE_STRING, "available",
+ SYSCTL_DESCR("IP load sharing supported algorithms"),
+ sysctl_ls_types, 0, NULL, 255, CTL_NET,
+ PF_INET, IPPROTO_IP, IPCTL_LOAD_SHARING, IPCTL_LS_AVAILABLE,
+ CTL_EOL);
}
Index: sys/netinet/ip_output.c
===================================================================
RCS file: /cvsroot/src/sys/netinet/ip_output.c,v
retrieving revision 1.184
diff -u -p -r1.184 ip_output.c
--- sys/netinet/ip_output.c 19 Sep 2007 04:33:43 -0000 1.184
+++ sys/netinet/ip_output.c 15 Nov 2007 08:40:23 -0000
@@ -171,6 +171,16 @@ int ip_do_loopback_cksum = 0;
(((csum_flags) & M_CSUM_TCPv4) != 0 && tcp_do_loopback_cksum) || \
(((csum_flags) & M_CSUM_IPv4) != 0 && ip_do_loopback_cksum)))
+/* See also string associations in ip_input.c if you want to change these */
+#define LS_NONE 0
+#define LS_RR 1
+#define LS_SS 2
+
+extern int load_sharing_index;
+
+#define tiny_sum(ip4a) ((ip4a >> 24) + (ip4a << 8 >> 24) + \
+ (ip4a << 16 >> 24) + (ip4a << 24 >> 24))
+
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
@@ -338,13 +348,43 @@ ip_output(struct mbuf *m0, ...)
mtu = ifp->if_mtu;
IFP_TO_IA(ifp, ia);
} else {
- if (ro->ro_rt == NULL)
+ int ro_cached = 1;
+ if (ro->ro_rt == NULL) {
rtcache_init(ro);
+ ro_cached = 0;
+ }
if (ro->ro_rt == NULL) {
ipstat.ips_noroute++;
error = EHOSTUNREACH;
goto bad;
}
+ /* Load-sharing */
+ if (ro->ro_rt->rt_total > 1 &&
+ load_sharing_index != LS_NONE &&
+ !(load_sharing_index == LS_SS && ro_cached)) {
+ ro->ro_rt->rt_refcnt--;
+ switch(load_sharing_index) {
+ case LS_RR:
+ ro->ro_rt = rtchoosepath_rr(ro->ro_rt);
+ break;
+ case LS_SS:
+ {
+ uint8_t i, hsh;
+ /* I'm not that happy with this "sum" */
+ hsh = ( tiny_sum(ip->ip_src.s_addr) +
+ tiny_sum(ip->ip_dst.s_addr) +
+ ip->ip_p + ip->ip_tos) %
+ ro->ro_rt->rt_total;
+ /* XXX: Normally it should be up... */
+ if (hsh == 0 && !(ro->ro_rt->rt_flags & RTF_UP))
+ ro->ro_rt = rtnext(ro->ro_rt);
+ else for (i = 0; i < hsh; i++)
+ ro->ro_rt = rtnext(ro->ro_rt);
+ }
+ break;
+ }
+ ro->ro_rt->rt_refcnt++;
+ }
ia = ifatoia(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
if ((mtu = ro->ro_rt->rt_rmx.rmx_mtu) == 0)
Index: sys/sys/queue.h
===================================================================
RCS file: /cvsroot/src/sys/sys/queue.h,v
retrieving revision 1.47
diff -u -p -r1.47 queue.h
--- sys/sys/queue.h 18 Jul 2007 12:07:35 -0000 1.47
+++ sys/sys/queue.h 15 Nov 2007 08:40:23 -0000
@@ -674,4 +674,57 @@ struct { \
? ((head)->cqh_last) \
: (elm->field.cqe_prev))
+/*
+ * Circular lists definitions
+ */
+#define CLIST_ENTRY(__type) \
+ struct { \
+ struct __type *cl_next; \
+ struct __type *cl_prev; \
+ }
+
+/*
+ * Circular lists functions
+ */
+#define CLIST_FOREACH1(__elm, __first, __sentinel, __field) \
+ for ((__elm) = (__sentinel) = (__first); (__elm) != NULL;\
+ (__elm) = ((__elm)->__field == (__sentinel)) \
+ ? NULL \
+ : (__elm)->__field)
+
+#define CLIST_FOREACH(__elm, __first, __sentinel, __field) \
+ CLIST_FOREACH1((__elm), (__first), __sentinel, __field.cl_next)
+
+#define CLIST_FOREACH_REVERSE(__elm, __first, __sentinel, __field) \
+ CLIST_FOREACH1((__elm), (__first), __sentinel, __field.cl_prev)
+
+#define CLIST_INIT(__elm, __field) \
+ do { \
+ (__elm)->__field.cl_prev = (__elm)->__field.cl_next = \
+ (__elm); \
+ } while (/*CONSTCOND*/0)
+
+#define CLIST_SINGULAR(__elm, __field) ((__elm)->__field.cl_prev == (__elm))
+
+#define CLIST_REMOVE(__elm, __field) \
+ do { \
+ (__elm)->__field.cl_prev->__field.cl_next = \
+ (__elm)->__field.cl_next; \
+ (__elm)->__field.cl_next->__field.cl_prev = \
+ (__elm)->__field.cl_prev; \
+ CLIST_INIT((__elm), __field); \
+ } while (/*CONSTCOND*/0)
+
+#define CLIST_INSERT_AFTER(__listelm, __elm, __field) \
+ do { \
+ assert(__listelm != __elm); \
+ (__elm)->__field.cl_prev = (__listelm); \
+ (__elm)->__field.cl_next = (__listelm)->__field.cl_next;\
+ (__listelm)->__field.cl_next = (__elm); \
+ (__elm)->__field.cl_next->__field.cl_prev = (__elm); \
+ } while (/*CONSTCOND*/0)
+
+#define CLIST_NEXT(__elm, __field) ((__elm)->__field.cl_next)
+#define CLIST_PREV(__elm, __field) ((__elm)->__field.cl_prev)
+
#endif /* !_SYS_QUEUE_H_ */
Index: usr.bin/netstat/route.c
===================================================================
RCS file: /cvsroot/src/usr.bin/netstat/route.c,v
retrieving revision 1.69
diff -u -p -r1.69 route.c
--- usr.bin/netstat/route.c 19 Jul 2007 20:51:04 -0000 1.69
+++ usr.bin/netstat/route.c 15 Nov 2007 08:40:23 -0000
@@ -171,6 +171,11 @@ again:
} else if (do_rtent) {
kget(rn, rtentry);
p_krtentry(&rtentry);
+ while ( CLIST_NEXT(&rtentry, rt_list) !=
+ (struct rtentry*)rn ) {
+ kget(CLIST_NEXT(&rtentry, rt_list), rtentry);
+ p_krtentry(&rtentry);
+ }
if (Aflag)
p_rtnode();
} else {
--Boundary-00=_plAPHj0PYRgIqff--