Subject: Patch for two mbuf dma optimizations
To: None <tech-kern@netbsd.org>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-kern
Date: 03/29/2003 12:22:53
--8t9RHnE3ZwKMSgU+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Folks...

This is the second in a series of simple patches to improve network
performance being contributed by Wasabi Systems.

Building on the pool cache paddr patch, this patch does three things:

	* Caches physical addresses of mbufs and clusters in the
	  m_hdr and m_ext, respectively.  This allows bus_dma back-ends
	  to avoid having to extract the physical address from the
	  virtual when dealing with plain mbufs and clusters.

	* For mbuf external data which is the result of sosend_loan,
	  remember pointers to the vm_page's for the loaned pages.
	  Initially, this saves some work when freeing the loaned
	  area.  Eventually, it can be used by bus_dma back-ends
	  to avoid having to extract the physical address from the
	  virtual.

	* Add a new M_EXT_ROMAP bit, which indicates that the mbuf
	  external data is mapped read-only at the MMU.  On some
	  platforms, this implies that all cache lines associated
	  with the buffer are clean, so the bus_dma back-end can
	  skip cleaning the cache for such buffers.  Use this bit
	  for sosend_loan'd buffers, since loaned pages are always
	  mapped read-only.

Patches for ARM and i386 bus_dma back-ends are forthcoming.

-- 
        -- Jason R. Thorpe <thorpej@wasabisystems.com>

--8t9RHnE3ZwKMSgU+
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=mbuf-patch

Index: sys/mbuf.h
===================================================================
RCS file: /cvsroot/src/sys/sys/mbuf.h,v
retrieving revision 1.79
diff -c -r1.79 mbuf.h
*** sys/mbuf.h	2003/03/22 02:21:57	1.79
--- sys/mbuf.h	2003/03/29 18:55:49
***************
*** 134,139 ****
--- 134,140 ----
  	struct	mowner *mh_owner;	/* mbuf owner */
  	int	mh_len;			/* amount of data in this mbuf */
  	int	mh_flags;		/* flags; see below */
+ 	paddr_t	mh_paddr;		/* physical address of mbuf */
  	short	mh_type;		/* type of data in this mbuf */
  };
  
***************
*** 172,177 ****
--- 173,184 ----
  #define	M_CSUM_IPv4		0x00000040	/* IPv4 header */
  #define	M_CSUM_IPv4_BAD		0x00000080	/* IPv4 header checksum bad */
  
+ /*
+  * Max # of pages we can attach to m_ext.  This is carefully chosen
+  * to be able to handle SOCK_LOAN_CHUNK on a 4K page size machine.
+  */
+ #define	M_EXT_MAXPAGES		((65536 / 4096) + 1)
+ 
  /* description of external storage mapped into mbuf, valid if M_EXT set */
  struct m_ext {
  	caddr_t	ext_buf;		/* start of buffer */
***************
*** 182,187 ****
--- 189,201 ----
  	struct malloc_type *ext_type;	/* malloc type */
  	struct mbuf *ext_nextref;
  	struct mbuf *ext_prevref;
+ 	union {
+ 		paddr_t extun_paddr;	/* physical address (M_EXT_CLUSTER) */
+ 					/* pages (M_EXT_PAGES) */
+ 		struct vm_page *extun_pgs[M_EXT_MAXPAGES];
+ 	} ext_un;
+ #define	ext_paddr	ext_un.extun_paddr
+ #define	ext_pgs		ext_un.extun_pgs
  #ifdef DEBUG
  	const char *ext_ofile;
  	const char *ext_nfile;
***************
*** 190,195 ****
--- 204,211 ----
  #endif
  };
  
+ #define	M_PADDR_INVALID		POOL_PADDR_INVALID
+ 
  struct mbuf {
  	struct	m_hdr m_hdr;
  	union {
***************
*** 210,215 ****
--- 226,232 ----
  #define	m_type		m_hdr.mh_type
  #define	m_flags		m_hdr.mh_flags
  #define	m_nextpkt	m_hdr.mh_nextpkt
+ #define	m_paddr		m_hdr.mh_paddr
  #define	m_pkthdr	M_dat.MH.MH_pkthdr
  #define	m_ext		M_dat.MH.MH_dat.MH_ext
  #define	m_pktdat	M_dat.MH.MH_dat.MH_databuf
***************
*** 237,242 ****
--- 254,261 ----
  /* additional flags for M_EXT mbufs */
  #define	M_EXT_FLAGS	0xff000000
  #define	M_EXT_CLUSTER	0x01000000	/* ext is a cluster */
+ #define	M_EXT_PAGES	0x02000000	/* ext_pgs is valid */
+ #define	M_EXT_ROMAP	0x04000000	/* ext mapping is r-o at MMU */
  
  /* for source-level compatibility */
  #define	M_CLUSTER	M_EXT_CLUSTER
***************
*** 453,460 ****
  do {									\
  	MBUFLOCK(							\
  		(m)->m_ext.ext_buf =					\
! 		    pool_cache_get(&mclpool_cache, (how) == M_WAIT ?	\
! 			(PR_WAITOK|PR_LIMITFAIL) : 0);			\
  		if ((m)->m_ext.ext_buf != NULL)				\
  			_MOWNERREF((m), M_EXT|M_CLUSTER);		\
  	);								\
--- 472,480 ----
  do {									\
  	MBUFLOCK(							\
  		(m)->m_ext.ext_buf =					\
! 		    pool_cache_get_paddr(&mclpool_cache,		\
! 		        (how) == M_WAIT ? (PR_WAITOK|PR_LIMITFAIL) : 0,	\
! 			&(m)->m_ext.ext_paddr);				\
  		if ((m)->m_ext.ext_buf != NULL)				\
  			_MOWNERREF((m), M_EXT|M_CLUSTER);		\
  	);								\
***************
*** 465,470 ****
--- 485,491 ----
  		(m)->m_ext.ext_size = MCLBYTES;				\
  		(m)->m_ext.ext_free = NULL;				\
  		(m)->m_ext.ext_arg = NULL;				\
+ 		/* ext_paddr initialized above */			\
  		MCLINITREFERENCE(m);					\
  	}								\
  } while (/* CONSTCOND */ 0)
***************
*** 505,511 ****
  		_MCLDEREFERENCE(m);					\
  		splx(_ms_);						\
  	} else if ((m)->m_flags & M_CLUSTER) {				\
! 		pool_cache_put(&mclpool_cache, (m)->m_ext.ext_buf);	\
  		splx(_ms_);						\
  	} else if ((m)->m_ext.ext_free) {				\
  		/*							\
--- 526,533 ----
  		_MCLDEREFERENCE(m);					\
  		splx(_ms_);						\
  	} else if ((m)->m_flags & M_CLUSTER) {				\
! 		pool_cache_put_paddr(&mclpool_cache, (m)->m_ext.ext_buf,\
! 		    (m)->m_ext.ext_paddr);				\
  		splx(_ms_);						\
  	} else if ((m)->m_ext.ext_free) {				\
  		/*							\
***************
*** 554,561 ****
  				_MCLDEREFERENCE(m);			\
  				pool_cache_put(&mbpool_cache, (m));	\
  			} else if ((m)->m_flags & M_CLUSTER) {		\
! 				pool_cache_put(&mclpool_cache,		\
! 				    (m)->m_ext.ext_buf);		\
  				pool_cache_put(&mbpool_cache, (m));	\
  			} else if ((m)->m_ext.ext_free) {		\
  				/*					\
--- 576,584 ----
  				_MCLDEREFERENCE(m);			\
  				pool_cache_put(&mbpool_cache, (m));	\
  			} else if ((m)->m_flags & M_CLUSTER) {		\
! 				pool_cache_put_paddr(&mclpool_cache,	\
! 				    (m)->m_ext.ext_buf,			\
! 				    (m)->m_ext.ext_paddr);		\
  				pool_cache_put(&mbpool_cache, (m));	\
  			} else if ((m)->m_ext.ext_free) {		\
  				/*					\
***************
*** 617,622 ****
--- 640,651 ----
  	  (((m)->m_flags & M_CLUSTER) == 0 || MCLISREFERENCED(m)))
  
  /*
+  * Determine if an mbuf's data area is read-only at the MMU.
+  */
+ #define	M_ROMAP(m)							\
+ 	(((m)->m_flags & (M_EXT|M_EXT_ROMAP)) == (M_EXT|M_EXT_ROMAP))
+ 
+ /*
   * Compute the amount of space available
   * before the current start of data in an mbuf.
   */
***************
*** 639,644 ****
--- 668,681 ----
  
  #define	M_TRAILINGSPACE(m)						\
  	(M_READONLY((m)) ? 0 : _M_TRAILINGSPACE((m)))
+ 
+ /*
+  * Compute the offset of the beginning of the data buffer of a non-ext
+  * mbuf.
+  */
+ #define	M_BUFOFFSET(m)							\
+ 	(((m)->m_flags & M_PKTHDR) ?					\
+ 	 offsetof(struct mbuf, m_pktdat) : offsetof(struct mbuf, m_dat))
  
  /*
   * Arrange to prepend space of size plen to mbuf m.
Index: kern/uipc_mbuf.c
===================================================================
RCS file: /cvsroot/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.64
diff -c -r1.64 uipc_mbuf.c
*** kern/uipc_mbuf.c	2003/02/26 06:31:11	1.64
--- kern/uipc_mbuf.c	2003/03/29 18:55:52
***************
*** 91,97 ****
  
  #include <net/if.h>
  
! #include <uvm/uvm_extern.h>
  
  
  struct	pool mbpool;		/* mbuf pool */
--- 91,97 ----
  
  #include <net/if.h>
  
! #include <uvm/uvm.h>
  
  
  struct	pool mbpool;		/* mbuf pool */
***************
*** 106,111 ****
--- 106,113 ----
  int	max_hdr;
  int	max_datalen;
  
+ static int mb_ctor(void *, void *, int);
+ 
  void	*mclpool_alloc(struct pool *, int);
  void	mclpool_release(struct pool *, void *);
  
***************
*** 147,153 ****
  	pool_set_drain_hook(&mbpool, m_reclaim, NULL);
  	pool_set_drain_hook(&mclpool, m_reclaim, NULL);
  
! 	pool_cache_init(&mbpool_cache, &mbpool, NULL, NULL, NULL);
  	pool_cache_init(&mclpool_cache, &mclpool, NULL, NULL, NULL);
  
  	/*
--- 149,155 ----
  	pool_set_drain_hook(&mbpool, m_reclaim, NULL);
  	pool_set_drain_hook(&mclpool, m_reclaim, NULL);
  
! 	pool_cache_init(&mbpool_cache, &mbpool, mb_ctor, NULL, NULL);
  	pool_cache_init(&mclpool_cache, &mclpool, NULL, NULL, NULL);
  
  	/*
***************
*** 287,292 ****
--- 289,308 ----
  {
  
  	uvm_km_free_poolpage1(mb_map, (vaddr_t)v);
+ }
+ 
+ /*ARGSUSED*/
+ static int
+ mb_ctor(void *arg, void *object, int flags)
+ {
+ 	struct mbuf *m = object;
+ 
+ #ifdef POOL_VTOPHYS
+ 	m->m_paddr = POOL_VTOPHYS(m);
+ #else
+ 	m->m_paddr = M_PADDR_INVALID;
+ #endif
+ 	return (0);
  }
  
  void
Index: kern/uipc_socket.c
===================================================================
RCS file: /cvsroot/src/sys/kern/uipc_socket.c,v
retrieving revision 1.78
diff -c -r1.78 uipc_socket.c
*** kern/uipc_socket.c	2003/02/26 06:31:11	1.78
--- kern/uipc_socket.c	2003/03/29 18:55:53
***************
*** 154,162 ****
  #define	SOCK_LOAN_CHUNK		65536
  
  static void
! sodoloanfree(caddr_t buf, size_t size)
  {
- 	struct vm_page **pgs;
  	vaddr_t va, sva, eva;
  	vsize_t len;
  	paddr_t pa;
--- 154,161 ----
  #define	SOCK_LOAN_CHUNK		65536
  
  static void
! sodoloanfree(struct vm_page **pgs, caddr_t buf, size_t size)
  {
  	vaddr_t va, sva, eva;
  	vsize_t len;
  	paddr_t pa;
***************
*** 167,178 ****
  	len = eva - sva;
  	npgs = len >> PAGE_SHIFT;
  
! 	pgs = alloca(npgs * sizeof(*pgs));
  
! 	for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
! 		if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
! 			panic("sodoloanfree: va 0x%lx not mapped", va);
! 		pgs[i] = PHYS_TO_VM_PAGE(pa);
  	}
  
  	pmap_kremove(sva, len);
--- 166,179 ----
  	len = eva - sva;
  	npgs = len >> PAGE_SHIFT;
  
! 	if (__predict_false(pgs == NULL)) {
! 		pgs = alloca(npgs * sizeof(*pgs));
  
! 		for (i = 0, va = sva; va < eva; i++, va += PAGE_SIZE) {
! 			if (pmap_extract(pmap_kernel(), va, &pa) == FALSE)
! 				panic("sodoloanfree: va 0x%lx not mapped", va);
! 			pgs[i] = PHYS_TO_VM_PAGE(pa);
! 		}
  	}
  
  	pmap_kremove(sva, len);
***************
*** 201,207 ****
  		splx(s);
  
  		rv += m->m_ext.ext_size;
! 		sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size);
  		s = splvm();
  		pool_cache_put(&mbpool_cache, m);
  	}
--- 202,210 ----
  		splx(s);
  
  		rv += m->m_ext.ext_size;
! 		sodoloanfree((m->m_flags & M_EXT_PAGES) ?
! 		    m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
! 		    m->m_ext.ext_size);
  		s = splvm();
  		pool_cache_put(&mbpool_cache, m);
  	}
***************
*** 214,220 ****
  		splx(s);
  
  		rv += m->m_ext.ext_size;
! 		sodoloanfree(m->m_ext.ext_buf, m->m_ext.ext_size);
  		s = splvm();
  		pool_cache_put(&mbpool_cache, m);
  	}
--- 217,225 ----
  		splx(s);
  
  		rv += m->m_ext.ext_size;
! 		sodoloanfree((m->m_flags & M_EXT_PAGES) ?
! 		    m->m_ext.ext_pgs : NULL, m->m_ext.ext_buf,
! 		    m->m_ext.ext_size);
  		s = splvm();
  		pool_cache_put(&mbpool_cache, m);
  	}
***************
*** 230,236 ****
  	int s;
  
  	if (m == NULL) {
! 		sodoloanfree(buf, size);
  		return;
  	}
  
--- 235,241 ----
  	int s;
  
  	if (m == NULL) {
! 		sodoloanfree(NULL, buf, size);
  		return;
  	}
  
***************
*** 248,254 ****
  	struct iovec *iov = uio->uio_iov;
  	vaddr_t sva, eva;
  	vsize_t len;
- 	struct vm_page **pgs;
  	vaddr_t lva, va;
  	int npgs, s, i, error;
  
--- 253,258 ----
***************
*** 265,270 ****
--- 269,277 ----
  	len = eva - sva;
  	npgs = len >> PAGE_SHIFT;
  
+ 	/* XXX KDASSERT */
+ 	KASSERT(npgs <= M_EXT_MAXPAGES);
+ 
  	while (socurkva + len > somaxkva) {
  		if (sodopendfree(so))
  			continue;
***************
*** 281,290 ****
  		return (0);
  	socurkva += len;
  
- 	pgs = alloca(npgs * sizeof(*pgs));
- 
  	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
! 	    pgs, UVM_LOAN_TOPAGE);
  	if (error) {
  		uvm_km_free(kernel_map, lva, len);
  		socurkva -= len;
--- 288,295 ----
  		return (0);
  	socurkva += len;
  
  	error = uvm_loan(&uio->uio_procp->p_vmspace->vm_map, sva, len,
! 	    m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
  	if (error) {
  		uvm_km_free(kernel_map, lva, len);
  		socurkva -= len;
***************
*** 292,303 ****
  	}
  
  	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
! 		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pgs[i]), VM_PROT_READ);
  	pmap_update(pmap_kernel());
  
  	lva += (vaddr_t) iov->iov_base & PAGE_MASK;
  
  	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
  
  	uio->uio_resid -= space;
  	/* uio_offset not updated, not set/used for write(2) */
--- 297,310 ----
  	}
  
  	for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
! 		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
! 		    VM_PROT_READ);
  	pmap_update(pmap_kernel());
  
  	lva += (vaddr_t) iov->iov_base & PAGE_MASK;
  
  	MEXTADD(m, (caddr_t) lva, space, M_MBUF, soloanfree, so);
+ 	m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
  
  	uio->uio_resid -= space;
  	/* uio_offset not updated, not set/used for write(2) */

--8t9RHnE3ZwKMSgU+--