tech-kern archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index][Old Index]
Re: very bad behavior on overquota writes
On Thu, Nov 22, 2012 at 12:46:54PM +0100, Manuel Bouyer wrote:
> Here's another patch, after comments from chs@ in a private mail.
>
> There really are 2 parts here:
> - avoid unecessery list walk at truncate time. This is the change to
> uvm_vnp_setsize() (truncate between pgend and oldsize instead of 0, which
> always triggers a list walk - we know there is nothing beyond oldsize)
> and ffs_truncate() (vtruncbuf() is needed only for non-VREG files,
> and will always trigger a list walk).
>
> This help for the local write case, where on my test system syscalls/s rise
> from 170 (when write succeed) to 570 (when EDQUOT is returned). Without
> this patch, the syscalls/s would fall to less than 20.
>
> But this doesn't help much for nfsd, which can get write requests way beyond
> the real end of file (the client's idea of end of file is wrong).
> In such a case, the distance between pgend and oldsize in uvm_vnp_setsize()
> is large enough to trigger a list walk, even through there is really only
> a few pages to free. I guess fixing this would require an interface change
> to pass to uvm_vnp_setsize() for which the allocation did really take place.
> But I found a workaround: when a write error occurs, free all the
> pages associated with the vnode in ffs_write(). This way, on subsequent writes
> only new pages should be in the list, and freeing them is fast.
>
> With this change, the nfsd performances don't change when the quota limit
> is hit.
In fact, with this patch alone, the nfsd performances drops
from about 1250 writes/s to 700 when the quota limit is hit (and we keep
a disk activity of 140MB/s when no data write occurs). To preserve the
performance, the attached patch is also needed. What this does is detect early
that we'll have an allocation failure in ffs_balloc(), and bail out
before we allocated any indirect block that we would need to freed in the
error path.
This needs an additionnal flag to to chkdq(), to test a quota allocation
without updating the quota values.
--
Manuel Bouyer <bouyer%antioche.eu.org@localhost>
NetBSD: 26 ans d'experience feront toujours la difference
--
Index: ffs/ffs_alloc.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_alloc.c,v
retrieving revision 1.130
diff -u -p -r1.130 ffs_alloc.c
--- ffs/ffs_alloc.c 28 Nov 2011 08:05:07 -0000 1.130
+++ ffs/ffs_alloc.c 22 Nov 2012 17:27:28 -0000
@@ -110,7 +110,6 @@ static daddr_t ffs_alloccg(struct inode
static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int);
static ino_t ffs_dirpref(struct inode *);
static daddr_t ffs_fragextend(struct inode *, int, daddr_t, int, int);
-static void ffs_fserr(struct fs *, u_int, const char *);
static daddr_t ffs_hashalloc(struct inode *, int, daddr_t, int, int,
daddr_t (*)(struct inode *, int, daddr_t, int, int));
static daddr_t ffs_nodealloccg(struct inode *, int, daddr_t, int, int);
@@ -2014,17 +2013,3 @@ ffs_mapsearch(struct fs *fs, struct cg *
panic("ffs_alloccg: block not in map");
/* return (-1); */
}
-
-/*
- * Fserr prints the name of a file system with an error diagnostic.
- *
- * The form of the error message is:
- * fs: error message
- */
-static void
-ffs_fserr(struct fs *fs, u_int uid, const char *cp)
-{
-
- log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
- uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp);
-}
Index: ffs/ffs_balloc.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_balloc.c,v
retrieving revision 1.54
diff -u -p -r1.54 ffs_balloc.c
--- ffs/ffs_balloc.c 23 Apr 2011 07:36:02 -0000 1.54
+++ ffs/ffs_balloc.c 22 Nov 2012 17:27:28 -0000
@@ -111,6 +111,7 @@ ffs_balloc_ufs1(struct vnode *vp, off_t
int32_t *blkp, *allocblk, allociblk[NIADDR + 1];
int32_t *allocib;
int unwindidx = -1;
+ off_t wantedblks;
#ifdef FFS_EI
const int needswap = UFS_FSNEEDSWAP(fs);
#endif
@@ -263,9 +264,43 @@ ffs_balloc_ufs1(struct vnode *vp, off_t
return (error);
/*
+ * before doing any allocations, check if we have enough space.
+ * This avoids going through allocations/deallocations if
+ * the filesystem or quota is short in space, and a badly-written
+ * software keeps retrying the write.
+ * Here we may check a size larger than what's really needed
+ * is indirect blocks are already there, but it's too large
+ * by at most 3 blocks. Not a big deal.
+ */
+ wantedblks = (flags & B_METAONLY) ? num : (num + 1);
+ if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0,
+ NULL, NULL, NULL) != 0) {
+ if (blkstofrags(fs, wantedblks) > freespace(fs,
fs->fs_minfree)) {
+ ffs_fserr(fs, kauth_cred_geteuid(cred),
+ "file system full");
+ uprintf("\n%s: write failed, file system is full\n",
+ fs->fs_fsmnt);
+ return (ENOSPC);
+ }
+ } else {
+ if (wantedblks > fs->fs_cstotal.cs_nbfree) {
+ ffs_fserr(fs, kauth_cred_geteuid(cred),
+ "file system full");
+ uprintf("\n%s: write failed, file system is full\n",
+ fs->fs_fsmnt);
+ return (ENOSPC);
+ }
+ }
+
+#if defined(QUOTA) || defined(QUOTA2)
+ if ((error = chkdq(ip, btodb(lblktosize(fs, wantedblks)), cred,
+ TRYONLY)) != 0)
+ return error;
+#endif
+
+ /*
* Fetch the first indirect block allocating if necessary.
*/
-
--num;
nb = ufs_rw32(ip->i_ffs1_ib[indirs[0].in_off], needswap);
allocib = NULL;
@@ -533,6 +571,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t
daddr_t *blkp, *allocblk, allociblk[NIADDR + 1];
int64_t *allocib;
int unwindidx = -1;
+ off_t wantedblks;
#ifdef FFS_EI
const int needswap = UFS_FSNEEDSWAP(fs);
#endif
@@ -793,6 +832,40 @@ ffs_balloc_ufs2(struct vnode *vp, off_t
return (error);
/*
+ * before doing any allocations, check if we have enough space.
+ * This avoids going through allocations/deallocations if
+ * the filesystem or quota is short in space, and a badly-written
+ * software keeps retrying the write.
+ * Here we may check a size larger than what's really needed
+ * is indirect blocks are already there, but it's too large
+ * by at most 3 blocks. Not a big deal.
+ */
+ wantedblks = (flags & B_METAONLY) ? num : (num + 1);
+ if (kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0,
+ NULL, NULL, NULL) != 0) {
+ if (blkstofrags(fs, wantedblks) > freespace(fs,
fs->fs_minfree)) {
+ ffs_fserr(fs, kauth_cred_geteuid(cred),
+ "file system full");
+ uprintf("\n%s: write failed, file system is full\n",
+ fs->fs_fsmnt);
+ return (ENOSPC);
+ }
+ } else {
+ if (wantedblks > fs->fs_cstotal.cs_nbfree) {
+ ffs_fserr(fs, kauth_cred_geteuid(cred),
+ "file system full");
+ uprintf("\n%s: write failed, file system is full\n",
+ fs->fs_fsmnt);
+ return (ENOSPC);
+ }
+ }
+
+#if defined(QUOTA) || defined(QUOTA2)
+ if ((error = chkdq(ip, btodb(lblktosize(fs, wantedblks)), cred,
+ TRYONLY)) != 0)
+ return error;
+#endif
+ /*
* Fetch the first indirect block allocating if necessary.
*/
Index: ffs/ffs_extern.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_extern.h,v
retrieving revision 1.78
diff -u -p -r1.78 ffs_extern.h
--- ffs/ffs_extern.h 17 Jun 2011 14:23:52 -0000 1.78
+++ ffs/ffs_extern.h 22 Nov 2012 17:27:28 -0000
@@ -193,6 +193,7 @@ void ffs_cg_swap(struct cg *, struct cg
#if defined(_KERNEL)
void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t);
int ffs_getblk(struct vnode *, daddr_t, daddr_t, int, bool, buf_t **);
+void ffs_fserr(struct fs *, u_int, const char *);
#endif /* defined(_KERNEL) */
void ffs_fragacct(struct fs *, int, int32_t[], int, int);
int ffs_isblock(struct fs *, u_char *, int32_t);
Index: ffs/ffs_subr.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ffs/ffs_subr.c,v
retrieving revision 1.47
diff -u -p -r1.47 ffs_subr.c
--- ffs/ffs_subr.c 14 Aug 2011 12:37:09 -0000 1.47
+++ ffs/ffs_subr.c 22 Nov 2012 17:27:28 -0000
@@ -64,6 +64,7 @@ void panic(const char *, ...)
#include <sys/inttypes.h>
#include <sys/pool.h>
#include <sys/fstrans.h>
+#include <sys/syslog.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
@@ -132,6 +133,20 @@ ffs_getblk(struct vnode *vp, daddr_t lbl
return error;
}
+
+/*
+ * Fserr prints the name of a file system with an error diagnostic.
+ *
+ * The form of the error message is:
+ * fs: error message
+ */
+void
+ffs_fserr(struct fs *fs, u_int uid, const char *cp)
+{
+
+ log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
+ uid, curproc->p_pid, curproc->p_comm, fs->fs_fsmnt, cp);
+}
#endif /* _KERNEL */
/*
Index: ufs/ufs_extern.h
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_extern.h,v
retrieving revision 1.71
diff -u -p -r1.71 ufs_extern.h
--- ufs/ufs_extern.h 1 Feb 2012 05:34:43 -0000 1.71
+++ ufs/ufs_extern.h 22 Nov 2012 17:27:28 -0000
@@ -144,6 +144,7 @@ int ufs_blkatoff(struct vnode *, off_t,
* Flags to chkdq() and chkiq()
*/
#define FORCE 0x01 /* force usage changes independent of limits */
+#define TRYONLY 0x02 /* check limits but don't update usage */
void ufsquota_init(struct inode *);
void ufsquota_free(struct inode *);
int chkdq(struct inode *, int64_t, kauth_cred_t, int);
Index: ufs/ufs_quota.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota.c,v
retrieving revision 1.108.2.2
diff -u -p -r1.108.2.2 ufs_quota.c
--- ufs/ufs_quota.c 13 Sep 2012 22:24:27 -0000 1.108.2.2
+++ ufs/ufs_quota.c 22 Nov 2012 17:27:28 -0000
@@ -154,6 +154,7 @@ chkdq(struct inode *ip, int64_t change,
int
chkiq(struct inode *ip, int32_t change, kauth_cred_t cred, int flags)
{
+ KASSERT((flags & TRYONLY) == 0);
/* do not track snapshot usage, or we will deadlock */
if ((ip->i_flags & SF_SNAPSHOT) != 0)
return 0;
Index: ufs/ufs_quota1.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota1.c,v
retrieving revision 1.18
diff -u -p -r1.18 ufs_quota1.c
--- ufs/ufs_quota1.c 2 Feb 2012 03:00:48 -0000 1.18
+++ ufs/ufs_quota1.c 22 Nov 2012 17:27:28 -0000
@@ -71,6 +71,8 @@ chkdq1(struct inode *ip, int64_t change,
if (change == 0)
return (0);
if (change < 0) {
+ if (flags & TRYONLY)
+ return (0);
for (i = 0; i < MAXQUOTAS; i++) {
if ((dq = ip->i_dquot[i]) == NODQUOT)
continue;
@@ -100,6 +102,8 @@ chkdq1(struct inode *ip, int64_t change,
return (error);
}
}
+ if (flags & TRYONLY)
+ return (0);
for (i = 0; i < MAXQUOTAS; i++) {
if ((dq = ip->i_dquot[i]) == NODQUOT)
continue;
Index: ufs/ufs_quota2.c
===================================================================
RCS file: /cvsroot/src/sys/ufs/ufs/ufs_quota2.c,v
retrieving revision 1.34.2.1
diff -u -p -r1.34.2.1 ufs_quota2.c
--- ufs/ufs_quota2.c 1 Oct 2012 19:55:22 -0000 1.34.2.1
+++ ufs/ufs_quota2.c 22 Nov 2012 17:27:28 -0000
@@ -465,6 +465,8 @@ quota2_check(struct inode *ip, int vtype
return 0;
}
if (change < 0) {
+ if (flags & TRYONLY)
+ return (0);
for (i = 0; i < MAXQUOTAS; i++) {
dq = ip->i_dquot[i];
if (dq == NODQUOT)
@@ -551,7 +553,7 @@ quota2_check(struct inode *ip, int vtype
if (dq == NODQUOT)
continue;
KASSERT(q2e[i] != NULL);
- if (error == 0) {
+ if (error == 0 && (flags & TRYONLY) == 0) {
q2vp = &q2e[i]->q2e_val[vtype];
ncurblks = ufs_rw64(q2vp->q2v_cur, needswap);
q2vp->q2v_cur = ufs_rw64(ncurblks + change, needswap);
Home |
Main Index |
Thread Index |
Old Index