Subject: Re: Corrupt data when reading filesystems under Linux guest
To: None <port-xen@NetBSD.org>
From: Jed Davis <jdev@panix.com>
List: port-xen
Date: 06/10/2005 01:05:03
In article <d88qle$s6r$1@sea.gmane.org>, Jed Davis <jdev@panix.com> wrote:
> In article <1118281429.532953.2723.nullmailer@yamt.dyndns.org>,
> YAMAMOTO Takashi  <yamt@mwd.biglobe.ne.jp> wrote:
> > 
> > besides that, xen_shm assumes an arbitrary request can be mapped into
> > a single contiguous virtual address range.  it's wrong.
> 
> It looks like it might be possible to fix that by changing xbdback_io
> to send off multiple xbdback_requests.  And with that done right, it
> shouldn't be too hard to glue together request segments that can be
> mapped into contiguous VM.
> 
> So I might even be able to fix this myself, if no-one more knowledgeable
> is working on it.

And I think I have; the case that failed before is now working,
although it's very, very slow because I'm not yet making any attempt to
combine the transfers, and neither is disksort() for reasons Thor just
explained to me out-of-band as I was writing this, so the disk (which is
apparently quite old) gets a lot of 512-byte transfers (which would be
4k if Linux weren't being bizarre).

I'm including the patch I have so far (I'd use MIME, but I'm doing this
through gmane with trn, and am not convinced of my ability to get the
headers right by hand).  I wasn't sure quite of the best way to have the
Xen request be responded to only when the last IO finished; I ended up
malloc(9)ing an int, pointing to it in the xbdback_request, and using
that as a reference count.  And there's also a big FIXME comment that's
due to my not knowing exactly how asynchronous things are here.

The easy next thing: gluing together the scatter/gather segments where
applicable.

The less-easy next thing: gluing together consecutive requests where
applicable; e.g., a 64k transfer broken into 44k and 20k parts.


------------------------------------------------------------------------
Index: sys/arch/xen/xen/xbdback.c
===================================================================
RCS file: /cvsroot/src/sys/arch/xen/xen/xbdback.c,v
retrieving revision 1.4.2.6
diff -u -r1.4.2.6 xbdback.c
--- sys/arch/xen/xen/xbdback.c	28 Apr 2005 10:27:50 -0000	1.4.2.6
+++ sys/arch/xen/xen/xbdback.c	10 Jun 2005 00:12:22 -0000
@@ -119,6 +119,7 @@
 	uint8_t rq_nrsegments; /* number of ma entries */
 	uint8_t rq_operation;
 	unsigned long rq_id;
+	int *rq_iocount; /* number of outstanding I/O's for this Xen event */
 };
 
 struct pool xbdback_request_pool;
@@ -547,7 +548,8 @@
 	struct xbdback_request *xbd_req;
 	int i, error = 0;
 	vaddr_t start_offset; /* start offset in vm area */
-	int req_size;
+	int req_size, buf_flags, sectors_done;
+	int *io_count;
 
 	if (req->nr_segments < 1 ||
 	    req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST ) {
@@ -561,69 +563,89 @@
 		    xbdi->domid, req->device);
 		return EINVAL;
 	}
-
-	xbd_req = pool_get(&xbdback_request_pool, PR_NOWAIT);
-	if (xbd_req == NULL) {
+	if (req->operation == BLKIF_OP_WRITE) {
+		if (vbd->flags & VBD_F_RO) {
+			return EROFS;
+		}
+		buf_flags = B_WRITE | B_CALL;
+	} else {
+		buf_flags = B_READ | B_CALL;
+	}
+	
+	/* malloc an int so the iodone can know when the last io is done */
+	MALLOC(io_count, int*, sizeof (int), M_TEMP, M_NOWAIT|M_ZERO);
+	if (io_count == NULL) {
 		return ENOMEM;
 	}
 
-	xbd_req->rq_xbdi = xbdi;
+	/*
+	 * Map each Xen request segment onto a separate I/O, for now;
+	 * coalescing segments that can use contiguous VM (e.g., the
+	 * case where each segment is a whole page, or successive
+	 * sectors in the same page) is probably worth doing later.
+	 */
+	sectors_done = 0;
+	for (i = 0; i < req->nr_segments; i++) {
+		xbd_req = pool_get(&xbdback_request_pool, PR_NOWAIT);
+		if (xbd_req == NULL) {
+			return ENOMEM;
+		}
 
-	start_offset =
-	    blkif_first_sect(req->frame_and_sects[0]) * VBD_BSIZE;
-	req_size = blkif_last_sect(req->frame_and_sects[0]) - 
-	    blkif_first_sect(req->frame_and_sects[0]) + 1;
-	XENPRINTF(("xbdback_io domain %d: frame_and_sects[0]=0x%lx\n",
-	     xbdi->domid, req->frame_and_sects[0]));
-	for (i = 1; i < req->nr_segments; i++) {
-		XENPRINTF(("xbdback_io domain %d: frame_and_sects[%d]=0x%lx\n",
-		     xbdi->domid, i, req->frame_and_sects[i]));
-		req_size += blkif_last_sect(req->frame_and_sects[i]) - 
+		xbd_req->rq_xbdi = xbdi;
+		xbd_req->rq_nrsegments = 0;
+		xbd_req->rq_iocount = io_count;
+
+		start_offset =
+		    blkif_first_sect(req->frame_and_sects[i]) * VBD_BSIZE;
+		req_size = blkif_last_sect(req->frame_and_sects[i]) - 
 		    blkif_first_sect(req->frame_and_sects[i]) + 1;
-	}
-	if (req_size < 0) {
-		printf("xbdback_io domain %d: negative-size request\n",
-		    xbdi->domid);
-		error = EINVAL;
-		goto end;
-	}
-	req_size = req_size * VBD_BSIZE;
-	XENPRINTF(("xbdback_io domain %d: start sect %d start %d size %d\n",
-	    xbdi->domid, (int)req->sector_number, (int)start_offset, req_size));
+		xbd_req->rq_ma[xbd_req->rq_nrsegments++] =
+		    (req->frame_and_sects[i] & ~PAGE_MASK);
+		XENPRINTF(("xbdback_io domain %d: frame_and_sects[%d]=0x%lx\n",
+		    xbdi->domid, i, req->frame_and_sects[i]));
 
-	BUF_INIT(&xbd_req->rq_buf);
-	if (req->operation == BLKIF_OP_WRITE) {
-		if (vbd->flags & VBD_F_RO) {
-			error = EROFS;
+		if (req_size < 0) {
+			printf("xbdback_io domain %d: negative-size request\n",
+			    xbdi->domid);
+			error = EINVAL;
+			goto end;
+		}
+		XENPRINTF(("xbdback_io domain %d: start sect %d start %d "
+		    "size %d\n", xbdi->domid, (int)req->sector_number,
+		    (int)start_offset, req_size));
+		
+		BUF_INIT(&xbd_req->rq_buf);
+		xbd_req->rq_buf.b_flags = buf_flags;
+		xbd_req->rq_buf.b_iodone = xbdback_iodone;
+		xbd_req->rq_buf.b_proc = NULL;
+		xbd_req->rq_buf.b_vp = vbd->vp;
+		xbd_req->rq_buf.b_dev = vbd->dev;
+		xbd_req->rq_buf.b_blkno = req->sector_number + sectors_done;
+		xbd_req->rq_buf.b_bcount = (daddr_t)(req_size * VBD_BSIZE);
+		xbd_req->rq_buf.b_data = (void *)start_offset;
+		xbd_req->rq_buf.b_private = xbd_req;
+		sectors_done += req_size;
+
+		switch(xbdback_map_shm(req, xbd_req)) {
+		case 0:
+			++*io_count;
+			xbdback_do_io(xbd_req);
+			break;
+		case ENOMEM:
+			if (xen_shm_callback(xbdback_shm_callback, xbd_req) ==
+			    0)
+				break;
+			error = ENOMEM;
+			goto end;
+		default:
+			error = EINVAL;
 			goto end;
 		}
-		xbd_req->rq_buf.b_flags = B_WRITE | B_CALL;
-	} else {
-		xbd_req->rq_buf.b_flags = B_READ | B_CALL;
-	}
-	xbd_req->rq_buf.b_iodone = xbdback_iodone;
-	xbd_req->rq_buf.b_proc = NULL;
-	xbd_req->rq_buf.b_vp = vbd->vp;
-	xbd_req->rq_buf.b_dev = vbd->dev;
-	xbd_req->rq_buf.b_blkno = req->sector_number;
-	xbd_req->rq_buf.b_bcount = (daddr_t)req_size;
-	xbd_req->rq_buf.b_data = (void *)start_offset;
-	xbd_req->rq_buf.b_private = xbd_req;
-	switch(xbdback_map_shm(req, xbd_req)) {
-	case 0:
-		xbdback_do_io(xbd_req);
-		return 0;
-	case ENOMEM:
-		if (xen_shm_callback(xbdback_shm_callback, xbd_req) == 0)
-			return 0;
-		error = ENOMEM;
-		break;
-	default:
-		error = EINVAL;
-		break;
 	}
+	return 0;
 end:
 	pool_put(&xbdback_request_pool, xbd_req);
+	if (*io_count == 0) FREE(io_count, M_TEMP);
 	return error;
 }
 
@@ -667,14 +689,14 @@
 		printf("xbdback_do_io vaddr 0x%lx bcount 0x%x doesn't fit in "
 		    " %d segments\n", bdata, xbd_req->rq_buf.b_bcount,
 		    xbd_req->rq_nrsegments);
-		panic("xbdback_do_io: not enouth segments");
+		panic("xbdback_do_io: not enough segments");
 	}
 	}
 #endif
 	if ((xbd_req->rq_buf.b_flags & B_READ) == 0)
 		xbd_req->rq_buf.b_vp->v_numoutput++;
-	XENPRINTF(("xbdback_io domain %d: start request\n",
-	    xbd_req->rq_xbdi->domid));
+	XENPRINTF(("xbdback_io domain %d: start request %lu nio=%d\n",
+	    xbd_req->rq_xbdi->domid, xbd_req->rq_id, *(xbd_req->rq_iocount)));
 	DEV_STRATEGY(&xbd_req->rq_buf);
 }
 
@@ -692,10 +714,24 @@
 		error = BLKIF_RSP_ERROR;
 	} else
 		error = BLKIF_RSP_OKAY;
-	XENPRINTF(("xbdback_io domain %d: end reqest error=%d\n",
-	    xbdi->domid, error));
-	xbdback_send_reply(xbdi, xbd_req->rq_id,
-	    xbd_req->rq_operation, error);
+
+	if (--*(xbd_req->rq_iocount) <= 0) {
+		/*
+		 * FIXME: could this wind up happening in the middle
+		 * of the xbdback_io that's sending off the I/O's
+		 * whose iodone callbacks are calling this, when it
+		 * still has more I/O's to dispatch for the Xen disk
+		 * request in question?  If so, that would be bad.
+		 */
+		XENPRINTF(("xbdback_io domain %d: end request %lu error=%d done\n",
+		    xbdi->domid, xbd_req->rq_id, error));
+		xbdback_send_reply(xbdi, xbd_req->rq_id,
+		    xbd_req->rq_operation, error);
+		FREE(xbd_req->rq_iocount, M_TEMP);
+	} else {
+		XENPRINTF(("xbdback_io domain %d: end request %lu error=%d left=%d\n",
+		    xbdi->domid, xbd_req->rq_id, error, *(xbd_req->rq_iocount)));
+	}
 	pool_put(&xbdback_request_pool, xbd_req);
 }
 
@@ -720,6 +756,8 @@
 		return ENOMEM;
 	}
 	xbd_req->rq_xbdi = xbdi;
+	xbd_req->rq_nrsegments = 1;
+	xbd_req->rq_ma[0] = (req->frame_and_sects[0] & ~PAGE_MASK);
 	if (xbdback_map_shm(req, xbd_req) != 0) {
 		pool_put(&xbdback_request_pool, xbd_req);
 		return EINVAL;
@@ -767,28 +805,17 @@
 	hypervisor_notify_via_evtchn(xbdi->evtchn);
 }
 
-/* map a request into our virtual address space */
+/*
+ * Map a request into our virtual address space.  The xbd_req->rq_ma
+ * array is to be filled out by the caller.
+ */
 static int
 xbdback_map_shm(blkif_request_t *req, struct xbdback_request *xbd_req)
 {
-	int i;
-
-	xbd_req->rq_nrsegments = req->nr_segments;
 	xbd_req->rq_id = req->id;
 	xbd_req->rq_operation = req->operation;
 
-#ifdef DIAGNOSTIC
-	if (req->nr_segments <= 0 ||
-	    req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
-		printf("xbdback_map_shm: %d segments\n", req->nr_segments);
-		panic("xbdback_map_shm");
-	}
-#endif
-
-	for (i = 0; i < req->nr_segments; i++) {
-		xbd_req->rq_ma[i] = (req->frame_and_sects[i] & ~PAGE_MASK);
-	}
-	return xen_shm_map(xbd_req->rq_ma, req->nr_segments,
+	return xen_shm_map(xbd_req->rq_ma, xbd_req->rq_nrsegments,
 		xbd_req->rq_xbdi->domid, &xbd_req->rq_vaddr, 0);
 }
 
------------------------------------------------------------------------
 
-- 
(let ((C call-with-current-continuation)) (apply (lambda (x y) (x y)) (map
((lambda (r) ((C C) (lambda (s) (r (lambda l (apply (s s) l))))))  (lambda
(f) (lambda (l) (if (null? l) C (lambda (k) (display (car l)) ((f (cdr l))
(C k)))))))    '((#\J #\d #\D #\v #\s) (#\e #\space #\a #\i #\newline)))))