Subject: Sample splice(2) via kcont(9), work-in-progress towards sendfile()
To: None <tech-net@netbsd.org>
From: Jonathan Stone <jonathan@dsg.stanford.edu>
List: tech-net
Date: 10/05/2005 09:37:37
Below is a rough first cut at splice(2), based on the
API from Kevin Fall and Sally Floyd's work:
Sally Floyd and Kevin Fall.
Promoting the use of end-to-end congestion control in the Internet.
ACM/IEEE Transactions on Networking, 7(4):458-473, August 1999
The patch uses kcont(9) to stitch together the data from two TCP PCBs.
I've used the patch with a trivial userspace program to forward a
wire-speed gigabit ttcp stream, with the ttcp sender running on
machine A, the ttcp receiver on machine C, and machine B in the middle
using splice(2) to TCP-terminate A's traffic and forward on a separate
connection to machine C;
      ttcp                    ttcp
     sender                 receiver
       A --------> B --------> C
               in-kernel
               TCP splice
Before any thought of committing, I'd want to rework the API, to split
the overloaded length/flags arguments into two separate arguments: an
int for flags, and a u_long or off_t for the size to copy. (though
the length was, historically, for UDP/record traffic...)
Oh, and the syscall numbers are for illustration only.
While that's interesting to me, as a prototype testbed for (for
example) 10GbE NICs, my guess is that implementing sendfile() using a
similar kcont scheme would be more interesting to a wider audience.
I'm sure the usual suspects will quickly notice that the patch
supports TCP only. I'm also interested in reworking the
protocol-level APIs to allow a more protocol/PF-neutral interface for
layering upper-level protocols on top of in-kernel APIs, but below the
*userspace* socket API.
Other comments welcomed.
Index: kern/syscalls.master
===================================================================
RCS file: /cvsroot/src/sys/kern/syscalls.master,v
retrieving revision 1.145
diff -w -u -r1.145 syscalls.master
--- kern/syscalls.master 25 Feb 2005 19:53:56 -0000 1.145
+++ kern/syscalls.master 1 Sep 2005 23:55:20 -0000
@@ -751,3 +751,6 @@
const sigset_t *mask); }
374 STD { int sys_pollts(struct pollfd *fds, u_int nfds, \
const struct timespec *ts, const sigset_t *mask); }
+375 STD { int sys_splice(int fd1, int fd2); }
+376 STD { int sys_sendfile(int fd, int s, off_t offset, size_t nbytes, \
+ void *hdtr, off_t *sbytes, int flags); }
--- /dev/null 2005-09-07 17:06:10.000000000 -0700
+++ sys_splice.c 2005-06-06 13:18:03.000000000 -0700
@@ -0,0 +1,378 @@
+
+/*
+ * Copyright 2005 Jonathan Stone.
+ * All rights reserved.
+ */
+#include <sys/syscallargs.h>
+#include <sys/kcont.h>
+
+const char spliceio[] = "spliceio";	/* wait message sys_splice() passes to tsleep() */
+/* Forward declarations for the functions defined below in this file. */
+int sys_splice(struct lwp *l, void *v, register_t *retval);
+int splice1(struct socket *s1, struct socket *s2, struct proc *p);
+int splice_movedata(struct socket *from, struct socket *to,
+ struct proc *p);
+void splice_sowake_tramp(struct socket *, caddr_t, int);
+void splice_sowake_continuation(void *, void *, int);
+
+/*
+ * Per-splice bookkeeping shared by sys_splice(), the socket upcall
+ * trampoline and the kcont continuation.
+ */
+typedef struct splice_state {
+ /*
+ * Address of this struct, for userspace syscall to wait upon for
+ * completion. But really for debugging. NOTE(review): nothing in
+ * this file assigns it; sys_splice() sleeps on the state pointer.
+ */
+ struct splice_state * splc_self;
+ /* The two spliced sockets. thought: make these an explicit array? */
+ struct socket *splc_so1, *splc_so2;
+ /* Process that called splice(2); passed down to the send path. */
+ struct proc * splc_proc;
+ /* Error (if any) returned to userspace upon completion. */
+ int splc_error;
+
+ /* Spliced socket state: */
+
+ /* kcont(s), one per socket. again, perhaps make these an explicit array? */
+ struct kc so1_kc;
+ struct kc so2_kc;
+} splice_state;
+
+
+
+/*
+ * Once data is ready to read, move that data from one socket's
+ * receive queue to the other socket's send queue. If the send
+ * socket has insufficient space, silently give up: we will try again
+ * later when sowwakeup() is called on the sending socket.
+ *
+ * Returns 0 if nothing could be moved, else the result of soreceive()
+ * or of the protocol's PRU_SEND request. A failed or empty receive
+ * shuts down both sockets.
+ */
+int
+splice_movedata(struct socket *from, struct socket *to,
+    struct proc *p)
+{
+	long space, sspace;
+	struct mbuf *top;
+	struct uio uio;
+	int error, rcvflags;
+	int s;
+
+	/*
+	 * Compute min(send space, available data).
+	 * XXX: needs a rework for sockets with record boundaries
+	 * (SOCK_DGRAM, SOCK_SEQPACKET).
+	 */
+	space = sbspace(&to->so_snd);
+	sspace = from->so_rcv.sb_cc;
+	if (sspace < space)
+		space = sspace;
+
+	if (space == 0)
+		return 0;
+
+	if (space < 0) {
+		printf("splice: negative space %ld to-q %ld from-q %lu\n",
+		    space, sbspace(&to->so_snd), from->so_rcv.sb_cc);
+		return 0;
+	}
+
+	/* ... read off from socket ... */
+	top = NULL;
+	bzero(&uio, sizeof(uio));
+	uio.uio_resid = space;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_procp = NULL;	/* XXX: p */
+	uio.uio_iov = NULL;
+	uio.uio_iovcnt = 0;
+	rcvflags = MSG_DONTWAIT;
+
+	/*
+	 * XXX JRS: try instead
+	 * (*from->so_recv)(from, &nam, &uio, &top, (struct mbuf**)0, &flags)
+	 * as in sys/nfs/nfs_socket.c
+	 */
+	error = soreceive(from, /* &paddr */ NULL, &uio,
+	    &top, /*controlp*/ NULL, &rcvflags);
+#if defined(DEBUG)
+	printf("asked for %ld got resid %ld\n", space, (long)uio.uio_resid);
+#endif
+
+	if (error != 0 || top == NULL) {
+		printf("splice: after recv, error = %d, top = %p\n",
+		    error, top);
+		/* An error may still leave a partial chain in top. */
+		if (top != NULL)
+			m_freem(top);
+		soshutdown(from, SHUT_RDWR);
+		soshutdown(to, SHUT_RDWR);
+		return (error);
+	}
+
+	/* send it ... XXX: or (*to->so_send)(to, NULL, NULL, top, NULL, 0, p) */
+	s = splsoftnet();
+	error = (*to->so_proto->pr_usrreq)(to, PRU_SEND, top, NULL, NULL, p);
+	splx(s);
+	return (error);
+}
+
+
+/*
+ * Attempt one transfer in each direction between two spliced sockets:
+ * first s1 -> s2, then s2 -> s1. A direction is tried only when its
+ * source passes soreadable() and its destination passes sowritable().
+ *
+ * Returns 0 on success, or the first error from splice_movedata(); an
+ * error in the s1 -> s2 direction skips the reverse direction.
+ */
+int
+splice1(struct socket *s1, struct socket *s2, struct proc *p)
+{
+	int rv = 0;
+
+	/* forward: s1 -> s2 */
+	if (soreadable(s1) && sowritable(s2))
+		rv = splice_movedata(s1, s2, p);
+	if (rv != 0)
+		return rv;
+
+	/* reverse: s2 -> s1 */
+	if (soreadable(s2) && sowritable(s1))
+		rv = splice_movedata(s2, s1, p);
+	return rv;
+}
+
+
+/*
+ * Socket upcall, called from protocol-specific code at splsoftnet
+ * context, where it is unsafe to remove or add data to the socket's
+ * so_snd or so_rcv queues. Schedule a kcont to run at a safer time.
+ * upcallarg is the struct splice_state; mflags is unused.
+ */
+void
+splice_sowake_tramp(struct socket *so, caddr_t upcallarg, int mflags)
+{
+	struct splice_state *knot = (struct splice_state *)upcallarg;
+	struct kc *cont = &knot->so2_kc;
+
+	if (so == knot->splc_so1)
+		cont = &knot->so1_kc;
+	kcont(cont, splice_sowake_continuation, knot, KC_IPL_DEFER_SOFTNET);
+	kcont_defer(cont, so, 0);
+}
+
+
+/*
+ * Called via kcont(9) from some suitable kcont level,
+ * after splice_sowake_tramp() has scheduled the kcont.
+ *
+ * When this function is called, we are no longer being called from
+ * protocol-specific code like (for example) tcp_input(); we can
+ * safely call soreceive() to fetch received data from one socket,
+ * then pass the received data to the other socket's send path.
+ *
+ * obj is the socket handed to kcont_defer() by the upcall, kc_env the
+ * struct splice_state; status is unused.
+ */
+void
+splice_sowake_continuation(void * obj, void * kc_env, int status)
+{
+	struct socket *so = obj;
+	struct splice_state * knot = kc_env;
+	struct socket *so1, *so2;
+	int error;
+
+	/* Which direction were we called in ? */
+	so1 = (so == knot->splc_so1) ? knot->splc_so1 : knot->splc_so2 ;
+	so2 = (so == knot->splc_so1) ? knot->splc_so2 : knot->splc_so1 ;
+
+	/* Record the first failure in the shared state (splc_error). */
+	error = splice1(so1, so2, knot->splc_proc);
+	if (error != 0 && knot->splc_error == 0)
+		knot->splc_error = error;
+
+	/*
+	 * XXX: check for clean close in both directions.
+	 * when we are done, wakeup the userspace caller sleeping
+	 * on the address of our struct splice_state.
+	 */
+#define CANSENDFROMTO(s1, s2) \
+	((((s1)->so_state & SS_CANTRCVMORE) == 0) && \
+	 (((s2)->so_state & SS_CANTSENDMORE) == 0))
+	if (!CANSENDFROMTO(so1, so2) && !CANSENDFROMTO(so2, so1)) {
+/*DBG*/		printf("splice state %p so %p %p: done\n", knot, so1, so2);
+		wakeup(knot);
+	}
+#undef CANSENDFROMTO
+}
+
+
+
+
+/*
+ * splice(2): join two connected SOCK_STREAM sockets in the kernel.
+ * Socket upcalls (splice_sowake_tramp) schedule kcont(9) continuations
+ * that forward data; the caller sleeps until the continuation finds
+ * that neither direction can carry more data.
+ *
+ * Returns 0, or EBADF (bad descriptor, or both name the same socket),
+ * ENOTSOCK, ESOCKTNOSUPPORT (types differ or are not SOCK_STREAM),
+ * ENOTCONN, ENOSPC (no memory for the splice state), an error from
+ * sblock(), or the first error recorded in splc_error.
+ */
+/* ARGSUSED */
+int
+sys_splice(struct lwp *l, void *v, register_t *retval)
+{
+	struct sys_splice_args /**/ {
+		syscallarg(int) fd1;
+		syscallarg(int) fd2;
+	} /* */ *uap = v;
+	struct filedesc *fdp;
+	struct file *fp1, *fp2;
+	struct socket *so1, *so2;
+	struct proc *p;
+	int error;
+	int s;
+	struct splice_state *knot;
+
+	p = l->l_proc;
+	fdp = p->p_fd;
+
+	if ((fp1 = fd_getfile(fdp, SCARG(uap, fd1))) == NULL)
+		return (EBADF);
+	FILE_USE(fp1);
+
+	if ((fp2 = fd_getfile(fdp, SCARG(uap, fd2))) == NULL) {
+		FILE_UNUSE(fp1, p);
+		return (EBADF);
+	}
+	FILE_USE(fp2);
+
+	if (fp1->f_type != DTYPE_SOCKET ||
+	    fp2->f_type != DTYPE_SOCKET) {
+		error = ENOTSOCK;	/* XXX EINVAL? */
+		goto done;
+	}
+
+	so1 = (struct socket*) fp1->f_data;
+	so2 = (struct socket*) fp2->f_data;
+
+	/* Same underlying socket?
+	 * XXX: should handle splice-to-self as a special case, for
+	 * test/measurement purposes? I think all we have to do
+	 * is not acquire the sblock twice (and not release twice).
+	 */
+	if (so1 == so2) {
+		error = EBADF;
+		goto done;
+	}
+
+	if (so1->so_type != so2->so_type) {
+		error = ESOCKTNOSUPPORT;
+		goto done;
+	}
+
+	/* XXX: implementation is really only SOCK-STREAM for now */
+	if (so1->so_type != SOCK_STREAM) {
+		error = ESOCKTNOSUPPORT;
+		goto done;
+	}
+
+	if ((so1->so_state & SS_ISCONNECTED) == 0 ||
+	    (so2->so_state & SS_ISCONNECTED) == 0) {
+		error = ENOTCONN;
+		goto done;
+	}
+
+	/*
+	 * Deadlock avoidance: ensure we obtain locks in a well-ordered
+	 * manner irrespective of the order in which callers supply
+	 * sockets.
+	 */
+	if (so2 < so1) {
+		struct socket *sotmp = so2;
+		so2 = so1;
+		so1 = sotmp;
+	}
+
+	/* sblock() can fail; do not proceed without both send locks. */
+	if ((error = sblock(&so1->so_snd, M_WAITOK)) != 0)
+		goto done;
+	if ((error = sblock(&so2->so_snd, M_WAITOK)) != 0)
+		goto unlock1;
+
+	/* Allocate the splice state exactly once. */
+	knot = malloc(sizeof(*knot), M_TEMP, M_WAITOK|M_ZERO);
+	if (knot == NULL) {
+		error = ENOSPC;
+		goto unlock2;
+	}
+
+/*DBG*/	printf("..splice socks %p %p state %p: setting up\n", so1, so2, knot);
+
+	/*
+	 * XXX: set up kconts to shuffle data from one socket
+	 * to another, whenever space permits...
+	 */
+	knot->splc_error = 0;
+	knot->splc_so1 = so1;
+	knot->splc_so2 = so2;
+	knot->splc_proc = p;
+
+	/* wire up socket upcalls */
+	s = splnet();
+	so1->so_upcall = splice_sowake_tramp;
+	so1->so_upcallarg = (caddr_t)knot;
+	so1->so_rcv.sb_flags |= SB_UPCALL;
+	so1->so_snd.sb_flags |= SB_UPCALL;
+
+	so2->so_upcall = splice_sowake_tramp;
+	so2->so_upcallarg = (caddr_t)knot;
+	so2->so_rcv.sb_flags |= SB_UPCALL;
+	so2->so_snd.sb_flags |= SB_UPCALL;
+	splx(s);
+
+	/* wait until both sockets are done */
+	tsleep((caddr_t)knot, PSOCK, spliceio, 0);
+
+/*DBG*/	printf("..splice done\n");
+
+	/* unwire upcalls */
+	s = splnet();
+	so1->so_snd.sb_flags &= ~SB_UPCALL;
+	so1->so_rcv.sb_flags &= ~SB_UPCALL;
+	so1->so_upcall = NULL;
+	so1->so_upcallarg = NULL;
+
+	so2->so_snd.sb_flags &= ~SB_UPCALL;
+	so2->so_rcv.sb_flags &= ~SB_UPCALL;
+	so2->so_upcall = NULL;
+	so2->so_upcallarg = NULL;
+	splx(s);
+
+	/* Report any error recorded in the splice state while we slept. */
+	error = knot->splc_error;
+	free(knot, M_TEMP);
+
+unlock2:
+	sbunlock(&so2->so_snd);
+unlock1:
+	sbunlock(&so1->so_snd);
+done:
+	FILE_UNUSE(fp1, p);
+	FILE_UNUSE(fp2, p);
+
+	return (error);
+}
+
+/* Placeholder for sendfile(2): not implemented, always fails with ENOSYS. */
+int
+sys_sendfile(struct lwp *l, void *v, register_t *retval)
+{
+	return (ENOSYS);
+}
Index: kern/init_sysent.c
===================================================================
RCS file: /cvsroot/src/sys/kern/init_sysent.c,v
retrieving revision 1.163
diff -w -u -r1.163 init_sysent.c
--- kern/init_sysent.c 27 Feb 2005 00:02:40 -0000 1.163
+++ kern/init_sysent.c 1 Sep 2005 23:55:20 -0000
@@ -1,4 +1,4 @@
-/* $NetBSD: init_sysent.c,v 1.163 2005/02/27 00:02:40 perry Exp $ */
+/* $NetBSD$ */
/*
* System call switch table.
@@ -8,7 +8,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: init_sysent.c,v 1.163 2005/02/27 00:02:40 perry Exp $");
+__KERNEL_RCSID(0, "$NetBSD$");
#include "opt_ktrace.h"
#include "opt_nfsserver.h"
@@ -994,10 +994,10 @@
sys_pselect }, /* 373 = pselect */
{ 4, s(struct sys_pollts_args), 0,
sys_pollts }, /* 374 = pollts */
- { 0, 0, 0,
- sys_nosys }, /* 375 = filler */
- { 0, 0, 0,
- sys_nosys }, /* 376 = filler */
+ { 2, s(struct sys_splice_args), 0,
+ sys_splice }, /* 375 = splice */
+ { 7, s(struct sys_sendfile_args), 0,
+ sys_sendfile }, /* 376 = sendfile */
{ 0, 0, 0,
sys_nosys }, /* 377 = filler */
{ 0, 0, 0,
Index: kern/syscalls.c
===================================================================
RCS file: /cvsroot/src/sys/kern/syscalls.c,v
retrieving revision 1.158
diff -w -u -r1.158 syscalls.c
--- kern/syscalls.c 27 Feb 2005 00:02:40 -0000 1.158
+++ kern/syscalls.c 1 Sep 2005 23:55:20 -0000
@@ -1,4 +1,4 @@
-/* $NetBSD: syscalls.c,v 1.158 2005/02/27 00:02:40 perry Exp $ */
+/* $NetBSD$ */
/*
* System call names.
@@ -8,7 +8,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: syscalls.c,v 1.158 2005/02/27 00:02:40 perry Exp $");
+__KERNEL_RCSID(0, "$NetBSD$");
#if defined(_KERNEL_OPT)
#include "opt_ktrace.h"
@@ -510,4 +510,6 @@
"extattr_list_link", /* 372 = extattr_list_link */
"pselect", /* 373 = pselect */
"pollts", /* 374 = pollts */
+ "splice", /* 375 = splice */
+ "sendfile", /* 376 = sendfile */
};
Index: kern/uipc_syscalls.c
===================================================================
RCS file: /cvsroot/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.90
diff -w -u -r1.90 uipc_syscalls.c
--- kern/uipc_syscalls.c 26 Feb 2005 21:34:55 -0000 1.90
+++ kern/uipc_syscalls.c 1 Sep 2005 23:55:22 -0000
@@ -1116,3 +1116,5 @@
*fpp = fp;
return (0);
}
+
+#include "/local/home/jonathan/sys_splice.c"
Index: sys/syscall.h
===================================================================
RCS file: /cvsroot/src/sys/sys/syscall.h,v
retrieving revision 1.156
diff -w -u -r1.156 syscall.h
--- sys/syscall.h 27 Feb 2005 00:03:25 -0000 1.156
+++ sys/syscall.h 1 Sep 2005 23:55:23 -0000
@@ -1,4 +1,4 @@
-/* $NetBSD: syscall.h,v 1.156 2005/02/27 00:03:25 perry Exp $ */
+/* $NetBSD$ */
/*
* System call numbers.
@@ -1025,5 +1025,11 @@
/* syscall: "pollts" ret: "int" args: "struct pollfd *" "u_int" "const struct timespec *" "const sigset_t *" */
#define SYS_pollts 374
-#define SYS_MAXSYSCALL 375
+/* syscall: "splice" ret: "int" args: "int" "int" */
+#define SYS_splice 375
+
+/* syscall: "sendfile" ret: "int" args: "int" "int" "off_t" "size_t" "void *" "off_t *" "int" */
+#define SYS_sendfile 376
+
+#define SYS_MAXSYSCALL 377
#define SYS_NSYSENT 512
Index: sys/syscallargs.h
===================================================================
RCS file: /cvsroot/src/sys/sys/syscallargs.h,v
retrieving revision 1.138
diff -w -u -r1.138 syscallargs.h
--- sys/syscallargs.h 27 Feb 2005 00:03:25 -0000 1.138
+++ sys/syscallargs.h 1 Sep 2005 23:55:23 -0000
@@ -1,4 +1,4 @@
-/* $NetBSD: syscallargs.h,v 1.138 2005/02/27 00:03:25 perry Exp $ */
+/* $NetBSD$ */
/*
* System call argument lists.
@@ -1588,6 +1588,21 @@
syscallarg(const sigset_t *) mask;
};
+struct sys_splice_args {
+ syscallarg(int) fd1;
+ syscallarg(int) fd2;
+};
+
+struct sys_sendfile_args {
+ syscallarg(int) fd;
+ syscallarg(int) s;
+ syscallarg(off_t) offset;
+ syscallarg(size_t) nbytes;
+ syscallarg(void *) hdtr;
+ syscallarg(off_t *) sbytes;
+ syscallarg(int) flags;
+};
+
/*
* System call prototypes.
*/
@@ -2251,4 +2266,8 @@
int sys_pollts(struct lwp *, void *, register_t *);
+int sys_splice(struct lwp *, void *, register_t *);
+
+int sys_sendfile(struct lwp *, void *, register_t *);
+
#endif /* _SYS__SYSCALLARGS_H_ */