Subject: Patch to use mbuf dma optimizations in ARM bus_dma back-end
To: None <tech-kern@netbsd.org>
From: Jason R Thorpe <thorpej@wasabisystems.com>
List: tech-kern
Date: 03/29/2003 13:57:10
--6TrnltStXW4iwmi0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Here is the fourth and final patch in the series of simple patches
to improve network performance contributed by Wasabi Systems.

This makes the ARM bus_dma back-end use the physical addresses cached
in mbufs and clusters when loading DMA maps, and use the read-only
mapping indication to skip redundant cache cleans when syncing them.
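
For illustration, here is roughly where the savings show up on the
driver side.  This is only a sketch of a typical transmit path ("sc"
and "dmamap" stand in for a hypothetical driver's softc and DMA map;
the bus_dma calls themselves are the standard MI interface):

	/*
	 * Sketch of a typical driver transmit path.  With this patch,
	 * bus_dmamap_load_mbuf() picks up the physical addresses
	 * already cached in the mbufs and clusters instead of walking
	 * the pmap for each segment, and the PREWRITE sync below
	 * becomes a no-op for any mbuf whose storage is mapped
	 * read-only at the MMU (M_ROMAP).
	 */
	error = bus_dmamap_load_mbuf(sc->sc_dmat, dmamap, m0,
	    BUS_DMA_NOWAIT);
	if (error)
		return (error);

	bus_dmamap_sync(sc->sc_dmat, dmamap, 0, dmamap->dm_mapsize,
	    BUS_DMASYNC_PREWRITE);

	for (seg = 0; seg < dmamap->dm_nsegs; seg++) {
		/* hand dm_segs[seg].ds_addr / ds_len to the chip */
	}

No driver changes should be needed; both optimizations live entirely
in the arm32 bus_dma back-end.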

-- 
        -- Jason R. Thorpe <thorpej@wasabisystems.com>

--6TrnltStXW4iwmi0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename=arm32-bus_dma-patch

Index: bus_dma.c
===================================================================
RCS file: /cvsroot/src/sys/arch/arm/arm32/bus_dma.c,v
retrieving revision 1.26
diff -c -r1.26 bus_dma.c
*** bus_dma.c	2003/03/27 19:42:30	1.26
--- bus_dma.c	2003/03/29 21:51:00
***************
*** 213,218 ****
--- 213,219 ----
  _bus_dmamap_load_mbuf(bus_dma_tag_t t, bus_dmamap_t map, struct mbuf *m0,
      int flags)
  {
+ 	struct arm32_dma_range *dr;
  	paddr_t lastaddr;
  	int seg, error, first;
  	struct mbuf *m;
***************
*** 236,250 ****
  	if (m0->m_pkthdr.len > map->_dm_size)
  		return (EINVAL);
  
! 	/* _bus_dmamap_load_buffer() clears this if we're not... */
! 	map->_dm_flags |= ARM32_DMAMAP_COHERENT;
  
  	first = 1;
  	seg = 0;
  	error = 0;
  	for (m = m0; m != NULL && error == 0; m = m->m_next) {
! 		error = _bus_dmamap_load_buffer(t, map, m->m_data, m->m_len,
! 		    NULL, flags, &lastaddr, &seg, first);
  		first = 0;
  	}
  	if (error == 0) {
--- 237,302 ----
  	if (m0->m_pkthdr.len > map->_dm_size)
  		return (EINVAL);
  
! 	/*
! 	 * Mbuf chains should almost never have coherent (i.e.
! 	 * un-cached) mappings, so clear that flag now.
! 	 */
! 	map->_dm_flags &= ~ARM32_DMAMAP_COHERENT;
  
  	first = 1;
  	seg = 0;
  	error = 0;
  	for (m = m0; m != NULL && error == 0; m = m->m_next) {
! 		if (m->m_len == 0)
! 			continue;
! 		/* XXX Could be better about coalescing. */
! 		/* XXX Doesn't check boundaries. */
! 		switch (m->m_flags & (M_EXT|M_CLUSTER)) {
! 		case M_EXT|M_CLUSTER:
! 			/* XXX KDASSERT */
! 			KASSERT(m->m_ext.ext_paddr != M_PADDR_INVALID);
! 			lastaddr = m->m_ext.ext_paddr +
! 			    (m->m_data - m->m_ext.ext_buf);
!  have_addr:
! 			if (first == 0 &&
! 			    ++seg >= map->_dm_segcnt) {
! 				error = EFBIG;
! 				break;
! 			}
! 			/*
! 			 * Make sure we're in an allowed DMA range.
! 			 */
! 			if (t->_ranges != NULL) {
! 				/* XXX cache last result? */
! 				dr = _bus_dma_inrange(t->_ranges, t->_nranges,
! 				    lastaddr);
! 				if (dr == NULL) {
! 					error = EINVAL;
! 					break;
! 				}
! 			
! 				/*
! 				 * In a valid DMA range.  Translate the
! 				 * physical memory address to an address
! 				 * in the DMA window.
! 				 */
! 				lastaddr = (lastaddr - dr->dr_sysbase) +
! 				    dr->dr_busbase;
! 			}
! 			map->dm_segs[seg].ds_addr = lastaddr;
! 			map->dm_segs[seg].ds_len = m->m_len;
! 			lastaddr += m->m_len;
! 			break;
! 
! 		case 0:
! 			lastaddr = m->m_paddr + M_BUFOFFSET(m) +
! 			    (m->m_data - M_BUFADDR(m));
! 			goto have_addr;
! 
! 		default:
! 			error = _bus_dmamap_load_buffer(t, map, m->m_data,
! 			    m->m_len, NULL, flags, &lastaddr, &seg, first);
! 		}
  		first = 0;
  	}
  	if (error == 0) {
***************
*** 409,418 ****
  		maddr = mtod(m, vaddr_t);
  		maddr += moff;
  
  		switch (ops) {
  		case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE:
! 			cpu_dcache_wbinv_range(maddr, minlen);
! 			break;
  
  		case BUS_DMASYNC_PREREAD:
  			if (((maddr | minlen) & arm_dcache_align_mask) == 0)
--- 461,489 ----
  		maddr = mtod(m, vaddr_t);
  		maddr += moff;
  
+ 		/*
+ 		 * We can save a lot of work here if we know the mapping
+ 		 * is read-only at the MMU:
+ 		 *
+ 		 * If a mapping is read-only, no dirty cache blocks will
+ 		 * exist for it.  If a writable mapping was made read-only,
+ 		 * we know any dirty cache lines for the range will have
+ 		 * been cleaned for us already.  Therefore, if the upper
+ 		 * layer can tell us we have a read-only mapping, we can
+ 		 * skip all cache cleaning.
+ 		 *
+ 		 * NOTE: This only works if we know the pmap cleans pages
+ 		 * before making a read-write -> read-only transition.  If
+ 		 * this ever becomes non-true (e.g. Physically Indexed
+ 		 * cache), this will have to be revisited.
+ 		 */
  		switch (ops) {
  		case BUS_DMASYNC_PREREAD|BUS_DMASYNC_PREWRITE:
! 			if (! M_ROMAP(m)) {
! 				cpu_dcache_wbinv_range(maddr, minlen);
! 				break;
! 			}
! 			/* else FALLTHROUGH */
  
  		case BUS_DMASYNC_PREREAD:
  			if (((maddr | minlen) & arm_dcache_align_mask) == 0)
***************
*** 422,428 ****
  			break;
  
  		case BUS_DMASYNC_PREWRITE:
! 			cpu_dcache_wb_range(maddr, minlen);
  			break;
  		}
  		moff = 0;
--- 493,500 ----
  			break;
  
  		case BUS_DMASYNC_PREWRITE:
! 			if (! M_ROMAP(m))
! 				cpu_dcache_wb_range(maddr, minlen);
  			break;
  		}
  		moff = 0;

--6TrnltStXW4iwmi0--