Increase on-disk inode size only after data have been synced. WARNING: THIS DRAFT PATCH IS CURRENTLY BROKEN AND WILL LIKELY CORRUPT YOUR FILE SYSTEM BEYOND FSCKABLE RECOGNITION! CAVEAT LUSER! ALSO, NO LFS OR EXT2FS CHANGES HERE! - Old invariant: DIP(ip, size) == ip->i_size - New invariant: DIP(ip, size) <= ip->i_size - New invariant: if type != VREG, then DIP(ip, size) == ip->i_size - Update DIP(ip, size) wherever we put pages <= DIP(ip, size). Goal is to plug the garbage-data-appended-after-write bug by never increasing file size until data have hit disk, using the syncer's calls to VOP_FSYNC to be notified of that, without requiring any complex partial ordering memory barrier cruft hooked into uvm/ubc. Inspired by conversation with maya@. Index: sys/ufs/ffs/ffs_balloc.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ffs/ffs_balloc.c,v retrieving revision 1.62 diff -p -u -r1.62 ffs_balloc.c --- sys/ufs/ffs/ffs_balloc.c 25 Sep 2016 11:45:39 -0000 1.62 +++ sys/ufs/ffs/ffs_balloc.c 13 Mar 2017 01:28:09 -0000 @@ -145,8 +145,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t if (error) return (error); ip->i_size = ffs_lblktosize(fs, nb + 1); - ip->i_ffs1_size = ip->i_size; - uvm_vnp_setsize(vp, ip->i_ffs1_size); + KASSERT(ip->i_ffs1_size <= ip->i_size); + uvm_vnp_setsize(vp, ip->i_size); ip->i_ffs1_db[nb] = ufs_rw32((u_int32_t)newb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (bpp && *bpp) { @@ -666,7 +666,7 @@ ffs_balloc_ufs2(struct vnode *vp, off_t if (error) return (error); ip->i_size = ffs_lblktosize(fs, nb + 1); - ip->i_ffs2_size = ip->i_size; + KASSERT(ip->i_ffs2_size <= ip->i_size); uvm_vnp_setsize(vp, ip->i_size); ip->i_ffs2_db[nb] = ufs_rw64(newb, needswap); ip->i_flag |= IN_CHANGE | IN_UPDATE; Index: sys/ufs/ffs/ffs_inode.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ffs/ffs_inode.c,v retrieving revision 1.123 diff -p -u -r1.123 ffs_inode.c --- 
sys/ufs/ffs/ffs_inode.c 11 Nov 2016 10:50:16 -0000 1.123 +++ sys/ufs/ffs/ffs_inode.c 13 Mar 2017 01:28:09 -0000 @@ -215,7 +215,7 @@ ffs_truncate(struct vnode *ovp, off_t le int64_t blocksreleased = 0; int i, aflag, nblocks; int error, allerror = 0; - off_t osize; + off_t osize, osyncsize, synclength; int sync; struct ufsmount *ump = oip->i_ump; void *dcookie; @@ -255,6 +255,7 @@ ffs_truncate(struct vnode *ovp, off_t le ffs_snapremove(ovp); osize = oip->i_size; + osyncsize = DIP(oip, size); aflag = ioflag & IO_SYNC ? B_SYNC : 0; /* @@ -284,17 +285,32 @@ ffs_truncate(struct vnode *ovp, off_t le trunc_page(osize & fs->fs_bmask), round_page(eob), PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + /* + * If we had already synced everything + * up to the pages we just put, we have + * now synced everything including + * those pages. + */ + if (osyncsize >= + trunc_page(osize & fs->fs_bmask)) { + DIP_ASSIGN(oip, size, osize); + } } } uvm_vnp_setwritesize(ovp, length); error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag); if (error) { + /* Restore synced size. */ + DIP_ASSIGN(oip, size, osyncsize); + /* Truncate any blocks we had allocated. 
*/ (void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred); + KASSERT(DIP(oip, size) <= oip->i_size); return (error); } uvm_vnp_setsize(ovp, length); oip->i_flag |= IN_CHANGE | IN_UPDATE; KASSERT(ovp->v_size == oip->i_size); + KASSERT(DIP(oip, size) <= oip->i_size); return (ffs_update(ovp, NULL, NULL, 0)); } @@ -343,7 +359,8 @@ ffs_truncate(struct vnode *ovp, off_t le genfs_node_wrlock(ovp); oip->i_size = length; - DIP_ASSIGN(oip, size, length); + synclength = MIN(DIP(oip, size), oip->i_size); + DIP_ASSIGN(oip, size, synclength); uvm_vnp_setsize(ovp, length); /* * Calculate index into inode's block list of @@ -403,7 +420,7 @@ ffs_truncate(struct vnode *ovp, off_t le } oip->i_size = osize; - DIP_ASSIGN(oip, size, osize); + DIP_ASSIGN(oip, size, MIN(DIP(oip, size), oip->i_size)); error = vtruncbuf(ovp, lastblock + 1, 0, 0); if (error && !allerror) allerror = error; @@ -501,7 +518,7 @@ ffs_truncate(struct vnode *ovp, off_t le */ oldspace = ffs_blksize(fs, oip, lastblock); oip->i_size = length; - DIP_ASSIGN(oip, size, length); + DIP_ASSIGN(oip, size, MIN(DIP(oip, size), oip->i_size)); newspace = ffs_blksize(fs, oip, lastblock); if (newspace == 0) panic("itrunc: newspace"); @@ -554,8 +571,9 @@ out: /* * Put back the real size. */ + KASSERT(synclength <= length); oip->i_size = length; - DIP_ASSIGN(oip, size, length); + DIP_ASSIGN(oip, size, synclength); DIP_ADD(oip, blocks, -blocksreleased); genfs_node_unlock(ovp); oip->i_flag |= IN_CHANGE; Index: sys/ufs/ffs/ffs_vnops.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ffs/ffs_vnops.c,v retrieving revision 1.125 diff -p -u -r1.125 ffs_vnops.c --- sys/ufs/ffs/ffs_vnops.c 25 Jul 2014 08:20:53 -0000 1.125 +++ sys/ufs/ffs/ffs_vnops.c 13 Mar 2017 01:28:10 -0000 @@ -371,6 +371,16 @@ ffs_fsync(void *v) goto out; } + /* + * If we changed the longest synced prefix, update the synced + * size of the inode and prepare to commit it to disk. 
+ */ + if (trunc_page(ap->a_offlo) <= DIP(VTOI(vp), size)) { + struct inode *ip = VTOI(vp); + KASSERT(DIP(ip, size) <= ip->i_size); + DIP_ASSIGN(ip, size, MAX(DIP(ip, size), MIN(ip->i_size, round_page(ap->a_offhi)))); + } + #ifdef WAPBL KASSERT(vp->v_type == VREG); if (mp->mnt_wapbl) { @@ -542,6 +552,16 @@ ffs_full_fsync(struct vnode *vp, int fla kauth_cred_get()); } + /* + * If we successfully synced everything to disk, commit the + * full size to disk. + */ + if (error == 0) { + struct inode *ip = VTOI(vp); + KASSERT(DIP(ip, size) <= ip->i_size); + DIP_ASSIGN(ip, size, ip->i_size); + } + return error; } Index: sys/ufs/ufs/ufs_readwrite.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ufs/ufs_readwrite.c,v retrieving revision 1.120 diff -p -u -r1.120 ufs_readwrite.c --- sys/ufs/ufs/ufs_readwrite.c 12 Apr 2015 22:48:38 -0000 1.120 +++ sys/ufs/ufs/ufs_readwrite.c 13 Mar 2017 01:28:10 -0000 @@ -388,10 +388,23 @@ WRITE(void *v) if (error) goto out; if (flags & B_SYNC) { + const off_t put_start = + trunc_page(osize & fs->fs_bmask); + const off_t put_end = round_page(eob); + mutex_enter(vp->v_interlock); - VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask), - round_page(eob), + error = VOP_PUTPAGES(vp, put_start, put_end, PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + /* + * XXX This historically never failed on + * VOP_PUTPAGES error. Why? No idea! + * + * If we had already synced up to the point we + * just put, then we have synced to the end of + * what we just put. 
+ */ + if (error == 0 && put_start <= DIP(ip, size)) + DIP_ASSIGN(ip, size, MAX(DIP(ip, size), MIN(ip->i_size, put_end))); } } @@ -482,20 +495,43 @@ WRITE(void *v) #ifndef LFS_READWRITE if (!async && oldoff >> 16 != uio->uio_offset >> 16) { + const off_t put_start = (oldoff >> 16) << 16; + const off_t put_end = (uio->uio_offset >> 16) << 16; + mutex_enter(vp->v_interlock); - error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16, - (uio->uio_offset >> 16) << 16, + error = VOP_PUTPAGES(vp, put_start, put_end, PGO_CLEANIT | PGO_JOURNALLOCKED | PGO_LAZY); if (error) break; + + /* + * If we had already synced up to the point we + * just put, then we have synced to the end of + * what we just put. XXX(review): this put is issued without PGO_SYNCIO, so the pages may not be on disk yet -- confirm. + */ + if (put_start <= DIP(ip, size)) + DIP_ASSIGN(ip, size, MAX(DIP(ip, size), MIN(ip->i_size, put_end))); } #endif } if (error == 0 && ioflag & IO_SYNC) { + const off_t put_start = trunc_page(origoff & fs->fs_bmask); + const off_t put_end = + round_page(ufs_blkroundup(fs, uio->uio_offset)); + mutex_enter(vp->v_interlock); - error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask), - round_page(ufs_blkroundup(fs, uio->uio_offset)), + error = VOP_PUTPAGES(vp, put_start, put_end, PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED); + if (error) + goto out; + + /* + * If we had already synced up to the point we just + * put, then we have synced to the end of what we just + * put. 
+ */ + if (put_start <= DIP(ip, size)) + DIP_ASSIGN(ip, size, MAX(DIP(ip, size), MIN(ip->i_size, put_end))); } out: Index: sys/ufs/ufs/ufs_vnops.c =================================================================== RCS file: /cvsroot/src/sys/ufs/ufs/ufs_vnops.c,v retrieving revision 1.234 diff -p -u -r1.234 ufs_vnops.c --- sys/ufs/ufs/ufs_vnops.c 9 Nov 2016 04:12:55 -0000 1.234 +++ sys/ufs/ufs/ufs_vnops.c 13 Mar 2017 01:28:10 -0000 @@ -1875,7 +1875,7 @@ ufs_gop_alloc(struct vnode *vp, off_t of UVMHIST_LOG(ubchist, "vp %p old 0x%x new 0x%x", vp, ip->i_size, off + bsize, 0); ip->i_size = off + bsize; - DIP_ASSIGN(ip, size, ip->i_size); + KASSERT(DIP(ip, size) <= ip->i_size); } off += bsize;