Index: distrib/sets/lists/comp/mi =================================================================== RCS file: /cvsroot/src/distrib/sets/lists/comp/mi,v retrieving revision 1.1117 diff -d -p -u -r1.1117 mi --- distrib/sets/lists/comp/mi 2 Mar 2008 00:30:21 -0000 1.1117 +++ distrib/sets/lists/comp/mi 2 Mar 2008 10:19:47 -0000 @@ -1975,6 +1975,7 @@ ./usr/include/sys/vnode_if.h comp-c-include ./usr/include/sys/vsio.h comp-obsolete obsolete ./usr/include/sys/wait.h comp-c-include +./usr/include/sys/wapbl.h comp-c-include ./usr/include/sys/wdog.h comp-c-include ./usr/include/sysexits.h comp-c-include ./usr/include/syslog.h comp-c-include Index: distrib/utils/sysinst/defs.h =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/defs.h,v retrieving revision 1.135 diff -d -p -u -r1.135 defs.h --- distrib/utils/sysinst/defs.h 28 Jan 2008 02:47:12 -0000 1.135 +++ distrib/utils/sysinst/defs.h 2 Mar 2008 10:20:09 -0000 @@ -177,6 +177,7 @@ typedef struct _partinfo { #define pi_cpg pi_partition.p_cpg char pi_mount[20]; uint pi_isize; /* bytes per inode (for # inodes) */ + uint pi_jsize; /* journal size, in blocks */ uint pi_flags; #define PIF_NEWFS 0x0001 /* need to 'newfs' partition */ #define PIF_FFSv2 0x0002 /* newfs with FFSv2, not FFSv1 */ @@ -188,6 +189,7 @@ typedef struct _partinfo { #define PIF_NOEXEC 0x0100 /* mount -o noexec */ #define PIF_NOSUID 0x0200 /* mount -o nosuid */ #define PIF_SOFTDEP 0x0400 /* mount -o softdep */ +#define PIF_LOG 0x0800 /* mount -o log */ #define PIF_MOUNT_OPTS 0x0ff0 /* all above mount flags */ #define PIF_RESET 0x1000 /* internal - restore previous values */ } partinfo; /* Single partition from a disklabel */ @@ -349,6 +351,7 @@ int incorelabel(const char *, partinfo * int edit_and_check_label(partinfo *, int, int, int); int getpartoff(int); int getpartsize(int, int); +int getjsize(int, int); void set_bsize(partinfo *, int); void set_fsize(partinfo *, int); void set_ptype(partinfo 
*, int, int); Index: distrib/utils/sysinst/disks.c =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/disks.c,v retrieving revision 1.99 diff -d -p -u -r1.99 disks.c --- distrib/utils/sysinst/disks.c 2 Feb 2008 09:26:45 -0000 1.99 +++ distrib/utils/sysinst/disks.c 2 Mar 2008 10:20:13 -0000 @@ -344,10 +344,11 @@ make_filesystems(void) break; case FS_BSDFFS: asprintf(&newfs, - "/sbin/newfs -V2 -O %d -b %d -f %d%s%.0d", + "/sbin/newfs -V2 -O %d -b %d -f %d%s%.0d %s%.0d", lbl->pi_flags & PIF_FFSv2 ? 2 : 1, lbl->pi_fsize * lbl->pi_frag, lbl->pi_fsize, - lbl->pi_isize != 0 ? " -i " : "", lbl->pi_isize); + lbl->pi_isize != 0 ? " -i " : "", lbl->pi_isize, + lbl->pi_jsize != 0 ? " -s " : "", -lbl->pi_jsize); mnt_opts = "-tffs -o async"; fsname = "ffs"; break; @@ -476,6 +477,8 @@ make_fstab(void) /* FALLTHROUGH */ case FS_BSDFFS: fsck_pass = (strcmp(mp, "/") == 0) ? 1 : 2; + if (bsdlabel[i].pi_flags & PIF_LOG) + fsck_pass = 0; dump_freq = 1; break; case FS_MSDOS: @@ -502,7 +505,7 @@ make_fstab(void) if (strcmp(mp, "/") == 0 && !(bsdlabel[i].pi_flags & PIF_MOUNT)) s = "# "; - scripting_fprintf(f, "%s/dev/%s%c\t\t%s\t%s\trw%s%s%s%s%s%s%s%s\t\t %d %d\n", + scripting_fprintf(f, "%s/dev/%s%c\t\t%s\t%s\trw%s%s%s%s%s%s%s%s%s\t\t %d %d\n", s, diskdev, 'a' + i, mp, fstype, bsdlabel[i].pi_flags & PIF_MOUNT ? "" : ",noauto", bsdlabel[i].pi_flags & PIF_ASYNC ? ",async" : "", @@ -512,6 +515,7 @@ make_fstab(void) bsdlabel[i].pi_flags & PIF_NOEXEC ? ",noexec" : "", bsdlabel[i].pi_flags & PIF_NOSUID ? ",nosuid" : "", bsdlabel[i].pi_flags & PIF_SOFTDEP ? ",softdep" : "", + bsdlabel[i].pi_flags & PIF_LOG ? 
",log" : "", dump_freq, fsck_pass); } Index: distrib/utils/sysinst/label.c =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/label.c,v retrieving revision 1.51 diff -d -p -u -r1.51 label.c --- distrib/utils/sysinst/label.c 23 Oct 2006 19:44:57 -0000 1.51 +++ distrib/utils/sysinst/label.c 2 Mar 2008 10:20:23 -0000 @@ -250,6 +250,18 @@ set_fsize(partinfo *p, int fsize) } static int +edit_fs_jsize(menudesc *m, void *arg) +{ + partinfo *p = arg; + int size; + + size = getjsize(p->pi_size, p->pi_jsize); + if (size != -1) + p->pi_jsize = size; + return 0; +} + +static int edit_fs_isize(menudesc *m, void *arg) { partinfo *p = arg; @@ -351,17 +363,19 @@ edit_ptn(menudesc *menu, void *arg) {NULL, OPT_NOMENU, OPT_IGNORE, NULL}, /* displays 'end' */ #define PTN_MENU_NEWFS 4 {NULL, OPT_NOMENU, 0, edit_fs_preserve}, -#define PTN_MENU_ISIZE 5 +#define PTN_MENU_JSIZE 5 + {NULL, OPT_NOMENU, 0, edit_fs_jsize}, +#define PTN_MENU_ISIZE 6 {NULL, OPT_NOMENU, 0, edit_fs_isize}, -#define PTN_MENU_BSIZE 6 +#define PTN_MENU_BSIZE 7 {NULL, MENU_selbsize, OPT_SUB, NULL}, -#define PTN_MENU_FSIZE 7 +#define PTN_MENU_FSIZE 8 {NULL, MENU_selfsize, OPT_SUB, NULL}, -#define PTN_MENU_MOUNT 8 +#define PTN_MENU_MOUNT 9 {NULL, OPT_NOMENU, 0, edit_fs_mount}, -#define PTN_MENU_MOUNTOPT 9 +#define PTN_MENU_MOUNTOPT 10 {NULL, MENU_mountoptions, OPT_SUB, NULL}, -#define PTN_MENU_MOUNTPT 10 +#define PTN_MENU_MOUNTPT 11 {NULL, OPT_NOMENU, 0, edit_fs_mountpt}, {MSG_askunits, MENU_sizechoice, OPT_SUB, NULL}, {MSG_restore, OPT_NOMENU, 0, edit_restore}, @@ -447,6 +461,10 @@ set_ptn_header(menudesc *m, void *arg) /* LFS doesn't have fragments */ continue; } + /* Only FFS can have a journal right now */ + if ((i == PTN_MENU_JSIZE) && + (!(p->pi_flags & PIF_NEWFS) || (t != FS_BSDFFS))) + continue; /* Ok: we want this one */ m->opts[i].opt_flags &= ~OPT_IGNORE; } @@ -497,6 +515,9 @@ set_ptn_label(menudesc *m, int opt, void wprintw(m->mw, 
msg_string(MSG_newfs_fmt), msg_string(p->pi_flags & PIF_NEWFS ? MSG_Yes : MSG_No)); break; + case PTN_MENU_JSIZE: + disp_sector_count(m, MSG_jsize_fmt, p->pi_jsize); + break; case PTN_MENU_ISIZE: wprintw(m->mw, msg_string(p->pi_isize > 0 ? MSG_isize_fmt : MSG_isize_fmt_dflt), p->pi_isize); @@ -528,6 +549,8 @@ set_ptn_label(menudesc *m, int opt, void wprintw(m->mw, "nosuid "); if (p->pi_flags & PIF_SOFTDEP) wprintw(m->mw, "softdep "); + if (p->pi_flags & PIF_LOG) + wprintw(m->mw, "log"); break; case PTN_MENU_MOUNTPT: wprintw(m->mw, msg_string(MSG_mountpt_fmt), p->pi_mount); @@ -881,6 +904,43 @@ getpartsize(int partstart, int defpartsi } /* NOTREACHED */ } + +/* Ask for a journal size, check bounds and do the needed roundups */ +int +getjsize(int partsize, int defjsize) +{ + char defsize[20], isize[20]; + int i, localsizemult; + const char *errmsg = "\n"; + int min, max; + + /* size between 4 MB and 10% of the partition */ + min = NUMSEC(4*MEG/sectorsize/sizemult, sizemult, dlcylsize); + max = NUMSEC(partsize/10/sizemult, sizemult, dlcylsize); + if (min > max) + min = max = 0; + + for (;;) { + snprintf(defsize, sizeof defsize, "%d", defjsize/sizemult); + msg_prompt_win(MSG_journal_size, -1, 12, 70, 7, + (defjsize != 0) ? 
defsize : 0, isize, sizeof isize, + errmsg, min/sizemult, multname, + max/sizemult, multname, multname); + if (strcmp(defsize, isize) == 0) + /* Don't do rounding if default accepted */ + return defjsize; + atofsb(isize, &i, &localsizemult); + /* round to cylinder size if localsizemult != 1 */ + if (i > 0) + i = NUMSEC(i/localsizemult, localsizemult, dlcylsize); + if (i == 0) + break; + if ((i >= min) && (i <= max)) + break; + errmsg = msg_string(MSG_invalid_journal_size); + } + return i; +} /* * convert a string to a number of sectors, with a possible unit Index: distrib/utils/sysinst/menus.mi =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/menus.mi,v retrieving revision 1.31 diff -d -p -u -r1.31 menus.mi --- distrib/utils/sysinst/menus.mi 4 Feb 2008 01:54:55 -0000 1.31 +++ distrib/utils/sysinst/menus.mi 2 Mar 2008 10:20:23 -0000 @@ -152,6 +152,8 @@ menu mountoptions, title MSG_toggle, y=5 { ((partinfo *)arg)->pi_flags ^= PIF_NOSUID; }; option "softdep", exit, action { ((partinfo *)arg)->pi_flags ^= PIF_SOFTDEP; }; + option "log", exit, action + { ((partinfo *)arg)->pi_flags ^= PIF_LOG; }; menu netbsd, title MSG_NetBSD_VERSION_Install_System, y=-1, exit, exitstring MSG_Exit_Install_System; Index: distrib/utils/sysinst/msg.mi.de =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/msg.mi.de,v retrieving revision 1.42 diff -d -p -u -r1.42 msg.mi.de --- distrib/utils/sysinst/msg.mi.de 4 Feb 2008 01:54:55 -0000 1.42 +++ distrib/utils/sysinst/msg.mi.de 2 Mar 2008 10:20:30 -0000 @@ -288,6 +288,9 @@ message bsize_fmt message fsize_fmt {Fragmentgröße: %9d bytes} +message jsize_fmt +{ Journalgröße: %9u %8u%c %9u} + message isize_fmt { Durchschnittliche Dateigröße: %d Bytes (zur Inode-Bestimmung) } message isize_fmt_dflt @@ -337,12 +340,25 @@ message invalid_sector_number {Ungültige Sektornummer } +message journal_size +{%s +(>= %d %s && <= %d %s) || 
0 + +journalgröße (%s)} + +message invalid_journal_size +{Ungültige journalgröße +} + message Select_file_system_block_size {Wählen Sie die Blockgröße des Dateisystems aus} message Select_file_system_fragment_size {Wählen Sie die Fragmentgröße des Dateisystems aus} +message Select_file_system_journal_size +{Wählen Sie die Journalgröße des Dateisystems aus} + message packname {Bitte geben Sie Ihrer NetBSD-Festplatte einen Namen} Index: distrib/utils/sysinst/msg.mi.en =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/msg.mi.en,v retrieving revision 1.149 diff -d -p -u -r1.149 msg.mi.en --- distrib/utils/sysinst/msg.mi.en 4 Feb 2008 01:54:56 -0000 1.149 +++ distrib/utils/sysinst/msg.mi.en 2 Mar 2008 10:20:36 -0000 @@ -281,6 +281,9 @@ message bsize_fmt message fsize_fmt { fragment size: %9d bytes} +message jsize_fmt +{ journal size: %9u %8u%c %9u} + message isize_fmt { avg file size: %9d bytes (for number of inodes)} message isize_fmt_dflt @@ -330,12 +333,25 @@ message invalid_sector_number {Badly formed sector number } +message journal_size +{%s + Valid sizes are in the range %d %s to %d %s, or 0. 
+ +journal size (%s)} + +message invalid_journal_size +{Badly formed journal size +} + message Select_file_system_block_size {Select file system block size} message Select_file_system_fragment_size {Select file system fragment size} +message Select_file_system_journal_size +{Select file system journal size} + message packname {Please enter a name for your NetBSD disk} Index: distrib/utils/sysinst/msg.mi.es =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/msg.mi.es,v retrieving revision 1.21 diff -d -p -u -r1.21 msg.mi.es --- distrib/utils/sysinst/msg.mi.es 4 Feb 2008 01:54:56 -0000 1.21 +++ distrib/utils/sysinst/msg.mi.es 2 Mar 2008 10:20:38 -0000 @@ -286,6 +286,9 @@ message bsize_fmt message fsize_fmt { tamaño frag: %9d bytes} +message jsize_fmt +{ tamaño del diario: %9u %8u%c %9u} + message isize_fmt {tam prom archi: %9d bytes (para número de inodos)} message isize_fmt_dflt @@ -335,12 +338,25 @@ message invalid_sector_number {Número de sector mal formado } +message journal_size +{%s + Los tamaños válidos están en la gama %d %s a %d %s, o 0. 
+ +tamaño del diario (%s)} + +message invalid_journal_size +{Tamaño gravemente formado del diario +} + message Select_file_system_block_size {Seleccione el tamaño de bloque del sistema de archivos} message Select_file_system_fragment_size {Seleccione el tamaño de fragmento del sistema de archivos} +message Select_file_system_journal_size +{Seleccione el tamaño del diario del sistema de ficheros} + message packname {Por favor entroduzca un nombre para el disco NetBSD} Index: distrib/utils/sysinst/msg.mi.fr =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/msg.mi.fr,v retrieving revision 1.101 diff -d -p -u -r1.101 msg.mi.fr --- distrib/utils/sysinst/msg.mi.fr 4 Feb 2008 01:54:56 -0000 1.101 +++ distrib/utils/sysinst/msg.mi.fr 2 Mar 2008 10:20:41 -0000 @@ -286,6 +286,9 @@ message bsize_fmt message fsize_fmt { taille de fragment: %9d bytes} +message jsize_fmt +{ taille de journal: %9u %8u%c %9u} + message isize_fmt { taille moyenne de fichier: %9d bytes} message isize_fmt_dflt @@ -335,12 +338,25 @@ message invalid_sector_number {nombre de secteurs invalide } +message journal_size +{%s +(>= %d %s && <= %d %s) || 0 + +taille de journal (%s)} + +message invalid_journal_size +{taille de journal mal indiqué +} + message Select_file_system_block_size {Selectionnez la taille de bloc du système de fichiers} message Select_file_system_fragment_size {Selectionnez la taille de fragment du système de fichiers} +message Select_file_system_journal_size +{Selectionnez la taille de journal du système de fichier} + message packname {Veuillez donner un nom à votre disque.} Index: distrib/utils/sysinst/msg.mi.pl =================================================================== RCS file: /cvsroot/src/distrib/utils/sysinst/msg.mi.pl,v retrieving revision 1.60 diff -d -p -u -r1.60 msg.mi.pl --- distrib/utils/sysinst/msg.mi.pl 4 Feb 2008 01:54:56 -0000 1.60 +++ distrib/utils/sysinst/msg.mi.pl 2 Mar 2008 10:20:43 -0000 @@ 
-277,6 +277,9 @@ message bsize_fmt message fsize_fmt { rozmiar fragmentu: %9d bajtow} +message jsize_fmt +{ rozmiar dziennika: %9u %8u%c %9u} + message isize_fmt { Sredni rozm. pliku: %9d bajtow} message isize_fmt_dflt @@ -326,12 +329,25 @@ message invalid_sector_number {Zle uformowany numer sektora } +message journal_size +{%s +(>= %d %s && <= %d %s) || 0 + +rozmiar dziennika (%s)} + +message invalid_journal_size +{Zle uformowany rozmiar dziennika +} + message Select_file_system_block_size {Wybierz rozmiar bloku dla systemu plikow} message Select_file_system_fragment_size {Wybierz rozmiar fragmentu dla systemu plikow} +message Select_file_system_journal_size +{Wybierz rozmiar dziennika dla systemu plikow} + message packname {Podaj nazwe dla swojego dysku NetBSD} Index: include/mntopts.h =================================================================== RCS file: /cvsroot/src/include/mntopts.h,v retrieving revision 1.10 diff -d -p -u -r1.10 mntopts.h --- include/mntopts.h 31 Oct 2006 08:12:46 -0000 1.10 +++ include/mntopts.h 2 Mar 2008 10:24:48 -0000 @@ -55,6 +55,7 @@ struct mntopt { #define MOPT_NOATIME { "atime", 1, MNT_NOATIME, 0 } #define MOPT_SYMPERM { "symperm", 0, MNT_SYMPERM, 0 } #define MOPT_SOFTDEP { "softdep", 0, MNT_SOFTDEP, 0 } +#define MOPT_LOG { "log", 0, MNT_LOG, 0 } #define MOPT_IGNORE { "hidden", 0, MNT_IGNORE, 0 } /* Control flags. 
*/ Index: sbin/fsck_ffs/Makefile =================================================================== RCS file: /cvsroot/src/sbin/fsck_ffs/Makefile,v retrieving revision 1.34 diff -d -p -u -r1.34 Makefile --- sbin/fsck_ffs/Makefile 9 Feb 2008 02:37:22 -0000 1.34 +++ sbin/fsck_ffs/Makefile 2 Mar 2008 10:25:17 -0000 @@ -19,6 +19,10 @@ SRCS+= progress.c .PATH: ${NETBSDSRCDIR}/sys/ufs/ffs ${FSCK} +SRCS+= vfs_wapbl.c wapbl.c +.PATH: ${NETBSDSRCDIR}/sys/kern +CPPFLAGS+=-DWAPBL_DEBUG_PRINT=0 + LDADD+=-lutil DPADD+=${LIBUTIL} Index: sbin/fsck_ffs/extern.h =================================================================== RCS file: /cvsroot/src/sbin/fsck_ffs/extern.h,v retrieving revision 1.22 diff -d -p -u -r1.22 extern.h --- sbin/fsck_ffs/extern.h 27 Jun 2005 01:25:35 -0000 1.22 +++ sbin/fsck_ffs/extern.h 2 Mar 2008 10:25:17 -0000 @@ -82,7 +82,11 @@ void setinodebuf(ino_t); int setup(const char *); void voidquit(int); -void swap_cg(struct cg *, struct cg *); -void copyback_cg(struct bufarea *); -void sb_oldfscompat_write(struct fs *, struct fs *); -void sb_oldfscompat_read(struct fs *, struct fs **); +void replay_wapbl(void); +void cleanup_wapbl(void); +int read_wapbl(char *, long, daddr_t); + +void swap_cg(struct cg *, struct cg *); +void copyback_cg(struct bufarea *); +void sb_oldfscompat_write(struct fs *, struct fs *); +void sb_oldfscompat_read(struct fs *, struct fs **); Index: sbin/fsck_ffs/setup.c =================================================================== RCS file: /cvsroot/src/sbin/fsck_ffs/setup.c,v retrieving revision 1.82 diff -d -p -u -r1.82 setup.c --- sbin/fsck_ffs/setup.c 23 Feb 2008 21:41:48 -0000 1.82 +++ sbin/fsck_ffs/setup.c 2 Mar 2008 10:25:19 -0000 @@ -159,6 +159,25 @@ setup(const char *dev) doskipclean = 0; pwarn("USING ALTERNATE SUPERBLOCK AT %d\n", bflag); } + if (sblock->fs_flags & FS_DOWAPBL) { + if (preen) { + if (!quiet) + pwarn("file system is journaled; not checking\n"); + return (-1); + } + if (!quiet) + pwarn("** File system is 
journaled; replaying journal\n"); + replay_wapbl(); + doskipclean = 0; + sblock->fs_flags &= ~FS_DOWAPBL; + sbdirty(); + /* Although we may have updated the superblock from the + * journal, we are still going to do a full check, so we + * don't bother to re-read the superblock from the journal. + * XXX, instead we could re-read the superblock and then not + * force doskipclean = 0 + */ + } if (debug) printf("clean = %d\n", sblock->fs_clean); if (doswap) @@ -218,6 +237,13 @@ setup(const char *dev) /* * Check and potentially fix certain fields in the super block. */ + if (sblock->fs_flags & ~(FS_KNOWN_FLAGS)) { + pfatal("UNKNOWN FLAGS=0x%08x IN SUPERBLOCK", sblock->fs_flags); + if (reply("CLEAR") == 1) { + sblock->fs_flags &= FS_KNOWN_FLAGS; + sbdirty(); + } + } if (sblock->fs_optim != FS_OPTTIME && sblock->fs_optim != FS_OPTSPACE) { pfatal("UNDEFINED OPTIMIZATION IN SUPERBLOCK"); if (reply("SET TO DEFAULT") == 1) { Index: sbin/fsck_ffs/utilities.c =================================================================== RCS file: /cvsroot/src/sbin/fsck_ffs/utilities.c,v retrieving revision 1.55 diff -d -p -u -r1.55 utilities.c --- sbin/fsck_ffs/utilities.c 23 Feb 2008 21:41:48 -0000 1.55 +++ sbin/fsck_ffs/utilities.c 2 Mar 2008 10:25:21 -0000 @@ -322,6 +322,7 @@ ckfini(void) if (debug) printf("cache missed %ld of %ld (%d%%)\n", diskreads, totalreads, (int)(diskreads * 100 / totalreads)); + cleanup_wapbl(); (void)close(fsreadfd); (void)close(fswritefd); } @@ -335,7 +336,8 @@ bread(int fd, char *buf, daddr_t blk, lo offset = blk; offset *= dev_bsize; - if (pread(fd, buf, (int)size, offset) == size) + if ((pread(fd, buf, (int)size, offset) == size) && + read_wapbl(buf, size, blk) == 0) return (0); rwerror("READ", blk); errs = 0; Index: sbin/fsck_ffs/wapbl.c =================================================================== RCS file: sbin/fsck_ffs/wapbl.c diff -N sbin/fsck_ffs/wapbl.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sbin/fsck_ffs/wapbl.c 2 Mar 2008 10:25:21 -0000 
@@ -0,0 +1,155 @@ +/* $NetBSD: wapbl.c,v 1.4 2006/02/20 01:01:20 dbj Exp $ */ + +/*- + * Copyright (c) 2005,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* This file contains fsck support for wapbl + */ + +#include +__KERNEL_RCSID(0, "$NetBSD: wapbl.c,v 1.4 2006/02/20 01:01:20 dbj Exp $"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "fsck.h" +#include "fsutil.h" +#include "extern.h" +#include "exitvalues.h" + +int +wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) +{ + + WAPBL_PRINTF(WAPBL_PRINT_IO, + ("wapbl_write: %zd bytes at block %"PRId64" on fd 0x%x\n", + len, pbn, fswritefd)); + bwrite(fswritefd, data, pbn, len); + return 0; +} + +int +wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) +{ + + WAPBL_PRINTF(WAPBL_PRINT_IO, + ("wapbl_read: %zd bytes at block %"PRId64" on fd 0x%x\n", + len, pbn, fsreadfd)); + bread(fsreadfd, data, pbn, len); + return 0; +} + +struct wapbl_replay *wapbl_replay; + +void +replay_wapbl(void) +{ + int error; + + if (debug) + wapbl_debug_print = WAPBL_PRINT_ERROR|WAPBL_PRINT_REPLAY; + if (debug > 1) + wapbl_debug_print |= WAPBL_PRINT_IO; + error = wapbl_replay_start(&wapbl_replay, + 0, + fsbtodb(sblock, sblock->fs_size), /* journal is after filesystem */ + 0 /* XXX */, + dev_bsize); + if (error) { + pfatal("UNABLE TO READ JOURNAL FOR REPLAY"); + if (reply("CONTINUE") == 0) { + exit(FSCK_EXIT_CHECK_FAILED); + } + return; + } + if (!nflag) { + error = 
wapbl_replay_write(wapbl_replay, 0); + if (error) { + pfatal("UNABLE TO REPLAY JOURNAL BLOCKS"); + if (reply("CONTINUE") == 0) { + exit(FSCK_EXIT_CHECK_FAILED); + } + } else { + wapbl_replay_stop(wapbl_replay); + } + } + { + int i; + for (i = 0; i < wapbl_replay->wr_inodescnt; i++) { + WAPBL_PRINTF(WAPBL_PRINT_REPLAY,("wapbl_replay: not cleaning inode %"PRIu32" mode %"PRIo32"\n", + wapbl_replay->wr_inodes[i].wr_inumber, wapbl_replay->wr_inodes[i].wr_imode)); + } + } +} + +void +cleanup_wapbl(void) +{ + + if (wapbl_replay) { + if (wapbl_replay_isopen(wapbl_replay)) + wapbl_replay_stop(wapbl_replay); + wapbl_replay_free(wapbl_replay); + wapbl_replay = 0; + } +} + +int +read_wapbl(char *buf, long size, daddr_t blk) +{ + + if (!wapbl_replay || !wapbl_replay_isopen(wapbl_replay)) + return 0; + return wapbl_replay_read(wapbl_replay, buf, blk, size); +} Index: sbin/fsdb/Makefile =================================================================== RCS file: /cvsroot/src/sbin/fsdb/Makefile,v retrieving revision 1.21 diff -d -p -u -r1.21 Makefile --- sbin/fsdb/Makefile 9 Feb 2008 02:37:22 -0000 1.21 +++ sbin/fsdb/Makefile 2 Mar 2008 10:25:21 -0000 @@ -16,6 +16,10 @@ FSCK_FFS=${NETBSDSRCDIR}/sbin/fsck_ffs CPPFLAGS+= -I${FSCK} -I${FSCK_FFS} .PATH: ${FSCK} ${FSCK_FFS} ${NETBSDSRCDIR}/sys/ufs/ffs +SRCS+= vfs_wapbl.c wapbl.c +.PATH: ${NETBSDSRCDIR}/sys/kern +CPPFLAGS+=-DWAPBL_DEBUG_PRINT=0 + LDADD+= -lutil -ledit -ltermcap .ifndef HOSTPROG DPADD+= ${LIBUTIL} ${LIBEDIT} ${LIBTERMCAP} Index: sbin/mount_ffs/mount_ffs.c =================================================================== RCS file: /cvsroot/src/sbin/mount_ffs/mount_ffs.c,v retrieving revision 1.22 diff -d -p -u -r1.22 mount_ffs.c --- sbin/mount_ffs/mount_ffs.c 16 Jul 2007 17:06:53 -0000 1.22 +++ sbin/mount_ffs/mount_ffs.c 2 Mar 2008 10:25:22 -0000 @@ -70,6 +70,7 @@ static const struct mntopt mopts[] = { MOPT_NODEVMTIME, MOPT_FORCE, MOPT_SOFTDEP, + MOPT_LOG, MOPT_GETARGS, MOPT_NULL, }; Index: sys/arch/amd64/conf/GENERIC 
=================================================================== RCS file: /cvsroot/src/sys/arch/amd64/conf/GENERIC,v retrieving revision 1.200 diff -d -p -u -r1.200 GENERIC --- sys/arch/amd64/conf/GENERIC 29 Feb 2008 14:42:32 -0000 1.200 +++ sys/arch/amd64/conf/GENERIC 2 Mar 2008 10:25:38 -0000 @@ -157,6 +157,7 @@ file-system TMPFS # Efficient memory f options QUOTA # UFS quotas #options FFS_EI # FFS Endian Independent support options SOFTDEP # FFS soft updates support. +options WAPBL # file system journal support # Note that UFS_DIRHASH is suspected of causing kernel memory corruption. # It is not recommended for general use. #options UFS_DIRHASH # UFS Large Directory Hashing - Experimental Index: sys/arch/i386/conf/GENERIC =================================================================== RCS file: /cvsroot/src/sys/arch/i386/conf/GENERIC,v retrieving revision 1.882 diff -d -p -u -r1.882 GENERIC --- sys/arch/i386/conf/GENERIC 29 Feb 2008 14:42:32 -0000 1.882 +++ sys/arch/i386/conf/GENERIC 2 Mar 2008 10:26:05 -0000 @@ -198,6 +198,7 @@ file-system TMPFS # Efficient memory f options QUOTA # UFS quotas #options FFS_EI # FFS Endian Independent support options SOFTDEP # FFS soft updates support. +options WAPBL # Write Ahead Physical Block Logging # Note that UFS_DIRHASH is suspected of causing kernel memory corruption. # It is not recommended for general use. 
#options UFS_DIRHASH # UFS Large Directory Hashing - Experimental Index: sys/arch/i386/stand/boot/boot2.c =================================================================== RCS file: /cvsroot/src/sys/arch/i386/stand/boot/boot2.c,v retrieving revision 1.22 diff -d -p -u -r1.22 boot2.c --- sys/arch/i386/stand/boot/boot2.c 23 Feb 2008 17:49:29 -0000 1.22 +++ sys/arch/i386/stand/boot/boot2.c 2 Mar 2008 10:26:08 -0000 @@ -128,6 +128,7 @@ struct bootconf_def { char *desc[MAXMENU]; /* Menu text per entry */ int nummenu; /* Number of menu items */ int timeout; /* Timeout in seconds */ + int scroll; /* Number of blank lines to scroll */ } bootconf; #endif /* !SMALL */ @@ -235,7 +236,11 @@ void print_banner(void) { #ifndef SMALL - int n; + int i, n; + + if (bootconf.scroll > 0) + for (i = 0; i < bootconf.scroll; i++) + printf("\n"); if (bootconf.banner[0]) { for (n = 0; bootconf.banner[n] && n < MAXBANNER; n++) printf("%s\n", bootconf.banner[n]); @@ -279,6 +284,7 @@ atoi(const char *in) * The recognised keywords are: * banner: text displayed instead of the normal welcome text * menu: Descriptive text:command to use + * scroll: number of blank lines to print before banner * timeout: Timeout in seconds (overrides that set by installboot) * default: the default menu option to use if Return is pressed * consdev: the console device to use @@ -290,6 +296,7 @@ atoi(const char *in) * menu=Boot into single user mode:boot netbsd -s * menu=:boot hd1a:netbsd -cs * menu=Goto boot comand line:prompt + * scroll=24 * timeout=10 * consdev=com0 * default=1 @@ -408,6 +415,8 @@ parsebootconf(const char *conf) bootconf.timeout = atoi(value); } else if (!strncmp(key, "default", 7)) { bootconf.def = atoi(value) - 1; + } else if (!strncmp(key, "scroll", 6)) { + bootconf.scroll = atoi(value); } else if (!strncmp(key, "consdev", 7)) { bootconf.consdev = value; } Index: sys/conf/files =================================================================== RCS file: /cvsroot/src/sys/conf/files,v 
retrieving revision 1.893 diff -d -p -u -r1.893 files --- sys/conf/files 24 Feb 2008 05:29:31 -0000 1.893 +++ sys/conf/files 2 Mar 2008 10:26:41 -0000 @@ -110,6 +110,10 @@ defflag opt_fileassoc.h FILEASSOC defflag opt_gre.h GRE_DEBUG +# Write Ahead Physical Block Logging +defflag opt_wapbl.h WAPBL WAPBL_DEBUG +defparam opt_wapbl.h WAPBL_DEBUG_PRINT + # compatibility options # defflag opt_compat_netbsd.h COMPAT_40 @@ -1472,6 +1476,7 @@ file kern/vfs_subr2.c file kern/vfs_syscalls.c file kern/vfs_trans.c file kern/vfs_vnops.c +file kern/vfs_wapbl.c wapbl file kern/vfs_xattr.c file kern/vnode_if.c file miscfs/deadfs/dead_vnops.c Index: sys/kern/init_main.c =================================================================== RCS file: /cvsroot/src/sys/kern/init_main.c,v retrieving revision 1.341 diff -d -p -u -r1.341 init_main.c --- sys/kern/init_main.c 20 Jan 2008 18:09:11 -0000 1.341 +++ sys/kern/init_main.c 2 Mar 2008 10:26:56 -0000 @@ -82,6 +82,7 @@ __KERNEL_RCSID(0, "$NetBSD: init_main.c, #include "opt_fileassoc.h" #include "opt_ktrace.h" #include "opt_pax.h" +#include "opt_wapbl.h" #include "rnd.h" #include "sysmon_envsys.h" @@ -163,6 +164,9 @@ __KERNEL_RCSID(0, "$NetBSD: init_main.c, #include #endif #include +#ifdef WAPBL +#include +#endif #include #include @@ -529,6 +533,11 @@ main(void) /* Initialize the UUID system calls. */ uuid_init(); +#ifdef WAPBL + /* Initialize write-ahead physical block logging. */ + wapbl_init(); +#endif + /* * Create process 1 (init(8)). 
We do this now, as Unix has * historically had init be process 1, and changing this would Index: sys/kern/vfs_bio.c =================================================================== RCS file: /cvsroot/src/sys/kern/vfs_bio.c,v retrieving revision 1.190 diff -d -p -u -r1.190 vfs_bio.c --- sys/kern/vfs_bio.c 29 Feb 2008 12:10:09 -0000 1.190 +++ sys/kern/vfs_bio.c 2 Mar 2008 10:27:00 -0000 @@ -1,11 +1,13 @@ /* $NetBSD: vfs_bio.c,v 1.190 2008/02/29 12:10:09 yamt Exp $ */ /*- - * Copyright (c) 2007 The NetBSD Foundation, Inc. + * Copyright (c) 2007,2008 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Andrew Doran. + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -133,6 +135,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v #include #include #include +#include #include @@ -710,12 +713,28 @@ bread(struct vnode *vp, daddr_t blkno, i buf_t **bpp) { buf_t *bp; + int error; /* Get buffer for block. */ bp = *bpp = bio_doread(vp, blkno, size, cred, 0); /* Wait for the read to complete, and return result. */ - return (biowait(bp)); + error = biowait(bp); + if (!error) { + struct mount *mp = wapbl_vptomp(vp); + + if (mp && mp->mnt_wapbl_replay && + WAPBL_REPLAY_ISOPEN(mp)) { + error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, + bp->b_bcount); + if (error) { + mutex_enter(&bufcache_lock); + SET(bp->b_cflags, BC_INVAL); + mutex_exit(&bufcache_lock); + } + } + } + return error; } /* @@ -787,6 +806,13 @@ bwrite(buf_t *bp) mp = NULL; } + if (mp && mp->mnt_wapbl) { + if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { + bdwrite(bp); + return 0; + } + } + /* * Remember buffer type, to switch on it later. 
If the write was * synchronous, but the file system was mounted with MNT_ASYNC, @@ -888,14 +914,22 @@ bdwrite(buf_t *bp) return; } - /* + if (wapbl_vphaswapbl(bp->b_vp)) { + struct mount *mp = wapbl_vptomp(bp->b_vp); + + if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { + WAPBL_ADD_BUF(mp, bp); + } + } + + /* * If the block hasn't been seen before: - * (1) Mark it as having been seen, - * (2) Charge for the write, - * (3) Make sure it's on its vnode's correct block list. + * (1) Mark it as having been seen, + * (2) Charge for the write, + * (3) Make sure it's on its vnode's correct block list. */ KASSERT(bp->b_vp == NULL || bp->b_objlock == &bp->b_vp->v_interlock); - + if (!ISSET(bp->b_oflags, BO_DELWRI)) { mutex_enter(&bufcache_lock); mutex_enter(bp->b_objlock); @@ -1018,6 +1052,16 @@ brelsel(buf_t *bp, int set) if (bioopsp != NULL) (*bioopsp->io_deallocate)(bp); + if (ISSET(bp->b_flags, B_LOCKED)) { + if (wapbl_vphaswapbl(vp = bp->b_vp)) { + struct mount *mp = wapbl_vptomp(vp); + + KASSERT(bp->b_iodone + != mp->mnt_wapbl_op->wo_wapbl_biodone); + WAPBL_REMOVE_BUF(mp, bp); + } + } + mutex_enter(bp->b_objlock); CLR(bp->b_oflags, BO_DONE|BO_DELWRI); if ((vp = bp->b_vp) != NULL) { @@ -1212,19 +1256,22 @@ geteblk(int size) int allocbuf(buf_t *bp, int size, int preserve) { - vsize_t oldsize, desired_size; void *addr; + vsize_t oldsize, desired_size; + int oldcount; int delta; desired_size = buf_roundsize(size); if (desired_size > MAXBSIZE) printf("allocbuf: buffer larger than MAXBSIZE requested"); + oldcount = bp->b_bcount; + bp->b_bcount = size; oldsize = bp->b_bufsize; if (oldsize == desired_size) - return 0; + goto out; /* * If we want a buffer of a different size, re-allocate the @@ -1262,6 +1309,11 @@ allocbuf(buf_t *bp, int size, int preser } } mutex_exit(&bufcache_lock); + + out: + if (wapbl_vphaswapbl(bp->b_vp)) + WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount); + return 0; } Index: sys/kern/vfs_lookup.c 
=================================================================== RCS file: /cvsroot/src/sys/kern/vfs_lookup.c,v retrieving revision 1.104 diff -d -p -u -r1.104 vfs_lookup.c --- sys/kern/vfs_lookup.c 30 Jan 2008 11:47:00 -0000 1.104 +++ sys/kern/vfs_lookup.c 2 Mar 2008 10:27:02 -0000 @@ -954,8 +954,10 @@ relookup(struct vnode *dvp, struct vnode if (cnp->cn_nameptr[0] == '\0') panic("relookup: null name"); +#ifdef ohcrap if (cnp->cn_flags & ISDOTDOT) panic("relookup: lookup on dot-dot"); +#endif /* * We now have a segment name to search for, and a directory to search. Index: sys/kern/vfs_subr.c =================================================================== RCS file: /cvsroot/src/sys/kern/vfs_subr.c,v retrieving revision 1.335 diff -d -p -u -r1.335 vfs_subr.c --- sys/kern/vfs_subr.c 24 Feb 2008 23:16:24 -0000 1.335 +++ sys/kern/vfs_subr.c 2 Mar 2008 10:27:06 -0000 @@ -107,6 +107,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v #include #include #include +#include #include #include @@ -1294,8 +1295,13 @@ vclean(vnode_t *vp, int flags) */ if (flags & DOCLOSE) { error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0); - if (error != 0) + if (error != 0) { + /* XXX, fix vn_start_write's grab of mp and use that. */ + + if (wapbl_vphaswapbl(vp)) + WAPBL_DISCARD(wapbl_vptomp(vp)); error = vinvalbuf(vp, 0, NOCRED, l, 0, 0); + } KASSERT(error == 0); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); if (active && (vp->v_type == VBLK || vp->v_type == VCHR)) { Index: sys/kern/vfs_subr2.c =================================================================== RCS file: /cvsroot/src/sys/kern/vfs_subr2.c,v retrieving revision 1.19 diff -d -p -u -r1.19 vfs_subr2.c --- sys/kern/vfs_subr2.c 15 Feb 2008 13:46:04 -0000 1.19 +++ sys/kern/vfs_subr2.c 2 Mar 2008 10:27:11 -0000 @@ -343,6 +343,12 @@ restart: * there is a slight chance that a delayed write will * occur while sleeping just above, so check for it. 
*/ + + /* + * XXX this is very bad if MNT_LOG, although + * will get caught when bwrite is told to write out + * the locked buffer. + */ if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) { #ifdef DEBUG printf("buffer still DELWRI\n"); @@ -1252,10 +1258,10 @@ vfs_mount_print(struct mount *mp, int fu char sbuf[256]; (*pr)("vnodecovered = %p syncer = %p data = %p\n", - mp->mnt_vnodecovered,mp->mnt_syncer,mp->mnt_data); + mp->mnt_vnodecovered, mp->mnt_syncer, mp->mnt_data); (*pr)("fs_bshift %d dev_bshift = %d\n", - mp->mnt_fs_bshift,mp->mnt_dev_bshift); + mp->mnt_fs_bshift, mp->mnt_dev_bshift); bitmask_snprintf(mp->mnt_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf)); (*pr)("flag = %s\n", sbuf); @@ -1266,38 +1272,41 @@ vfs_mount_print(struct mount *mp, int fu (*pr)("refcnt = %d lock @ %p writer = %p\n", mp->mnt_refcnt, &mp->mnt_lock, mp->mnt_writer); + (*pr)("wapbl = %p, wapbl_replay = %p\n", + mp->mnt_wapbl, mp->mnt_wapbl_replay); + (*pr)("statvfs cache:\n"); - (*pr)("\tbsize = %lu\n",mp->mnt_stat.f_bsize); - (*pr)("\tfrsize = %lu\n",mp->mnt_stat.f_frsize); - (*pr)("\tiosize = %lu\n",mp->mnt_stat.f_iosize); + (*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize); + (*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize); + (*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize); - (*pr)("\tblocks = %"PRIu64"\n",mp->mnt_stat.f_blocks); - (*pr)("\tbfree = %"PRIu64"\n",mp->mnt_stat.f_bfree); - (*pr)("\tbavail = %"PRIu64"\n",mp->mnt_stat.f_bavail); - (*pr)("\tbresvd = %"PRIu64"\n",mp->mnt_stat.f_bresvd); + (*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks); + (*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree); + (*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail); + (*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd); - (*pr)("\tfiles = %"PRIu64"\n",mp->mnt_stat.f_files); - (*pr)("\tffree = %"PRIu64"\n",mp->mnt_stat.f_ffree); - (*pr)("\tfavail = %"PRIu64"\n",mp->mnt_stat.f_favail); - (*pr)("\tfresvd = %"PRIu64"\n",mp->mnt_stat.f_fresvd); + (*pr)("\tfiles = %"PRIu64"\n", 
mp->mnt_stat.f_files); + (*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree); + (*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail); + (*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd); (*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n", mp->mnt_stat.f_fsidx.__fsid_val[0], mp->mnt_stat.f_fsidx.__fsid_val[1]); - (*pr)("\towner = %"PRIu32"\n",mp->mnt_stat.f_owner); - (*pr)("\tnamemax = %lu\n",mp->mnt_stat.f_namemax); + (*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner); + (*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax); bitmask_snprintf(mp->mnt_stat.f_flag, __MNT_FLAG_BITS, sbuf, sizeof(sbuf)); - (*pr)("\tflag = %s\n",sbuf); - (*pr)("\tsyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_syncwrites); - (*pr)("\tasyncwrites = %" PRIu64 "\n",mp->mnt_stat.f_asyncwrites); - (*pr)("\tsyncreads = %" PRIu64 "\n",mp->mnt_stat.f_syncreads); - (*pr)("\tasyncreads = %" PRIu64 "\n",mp->mnt_stat.f_asyncreads); - (*pr)("\tfstypename = %s\n",mp->mnt_stat.f_fstypename); - (*pr)("\tmntonname = %s\n",mp->mnt_stat.f_mntonname); - (*pr)("\tmntfromname = %s\n",mp->mnt_stat.f_mntfromname); + (*pr)("\tflag = %s\n", sbuf); + (*pr)("\tsyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_syncwrites); + (*pr)("\tasyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_asyncwrites); + (*pr)("\tsyncreads = %" PRIu64 "\n", mp->mnt_stat.f_syncreads); + (*pr)("\tasyncreads = %" PRIu64 "\n", mp->mnt_stat.f_asyncreads); + (*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename); + (*pr)("\tmntonname = %s\n", mp->mnt_stat.f_mntonname); + (*pr)("\tmntfromname = %s\n", mp->mnt_stat.f_mntfromname); { int cnt = 0; Index: sys/kern/vfs_syscalls.c =================================================================== RCS file: /cvsroot/src/sys/kern/vfs_syscalls.c,v retrieving revision 1.345 diff -d -p -u -r1.345 vfs_syscalls.c --- sys/kern/vfs_syscalls.c 30 Jan 2008 11:47:01 -0000 1.345 +++ sys/kern/vfs_syscalls.c 2 Mar 2008 10:27:17 -0000 @@ -211,12 +211,16 @@ mount_update(struct lwp *l, struct vnode mp->mnt_flag &= 
~(MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP | - MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP); + MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP | + MNT_LOG); mp->mnt_flag |= flags & (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP | MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP | - MNT_IGNORE); + MNT_LOG | MNT_IGNORE); +#if 1 /* XXX "mount -u -o log" doesn't work on -current */ + mp->mnt_flag &= ~MNT_LOG; +#endif /* XXX */ error = VFS_MOUNT(mp, path, data, data_len); @@ -359,7 +363,7 @@ mount_domount(struct lwp *l, struct vnod (MNT_FORCE | MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOCOREDUMP | MNT_NOATIME | MNT_NODEVMTIME | MNT_SYMPERM | MNT_SOFTDEP | - MNT_IGNORE | MNT_RDONLY); + MNT_LOG | MNT_IGNORE | MNT_RDONLY); error = VFS_MOUNT(mp, path, data, data_len); mp->mnt_flag &= ~MNT_OP_FLAGS; Index: sys/kern/vfs_vnops.c =================================================================== RCS file: /cvsroot/src/sys/kern/vfs_vnops.c,v retrieving revision 1.154 diff -d -p -u -r1.154 vfs_vnops.c --- sys/kern/vfs_vnops.c 30 Jan 2008 09:50:22 -0000 1.154 +++ sys/kern/vfs_vnops.c 2 Mar 2008 10:27:19 -0000 @@ -61,6 +61,7 @@ __KERNEL_RCSID(0, "$NetBSD: vfs_vnops.c, #include #include #include +#include #include @@ -678,6 +679,11 @@ vn_lock(struct vnode *vp, int flags) LK_CANRECURSE)) == 0); +#ifdef DIAGNOSTIC + if (wapbl_vphaswapbl(vp)) + WAPBL_JUNLOCK_ASSERT(wapbl_vptomp(vp)); +#endif + do { if ((flags & LK_INTERLOCK) == 0) mutex_enter(&vp->v_interlock); Index: sys/kern/vfs_wapbl.c =================================================================== RCS file: sys/kern/vfs_wapbl.c diff -N sys/kern/vfs_wapbl.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/kern/vfs_wapbl.c 2 Mar 2008 10:27:25 -0000 @@ -0,0 +1,2789 @@ +/* $NetBSD: vfs_wapbl.c,v 1.51 2007/10/09 15:10:15 simonb Exp $ */ + +/*- + * Copyright 
(c) 2003,2008 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Wasabi Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * This implements file system independent write ahead filesystem logging. + */ +#include +__KERNEL_RCSID(0, "$NetBSD: vfs_wapbl.c,v 1.51 2007/10/09 15:10:15 simonb Exp $"); + +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if WAPBL_UVM_ALLOC +#include +#endif + +#include + +MALLOC_JUSTDEFINE(M_WAPBL, "wapbl", "write-ahead physical block logging"); +#define wapbl_malloc(s) malloc((s), M_WAPBL, M_WAITOK) +#define wapbl_free(a) free((a), M_WAPBL) +#define wapbl_calloc(n, s) malloc((n)*(s), M_WAPBL, M_WAITOK | M_ZERO) + +#else /* !_KERNEL */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#define KDASSERT(x) assert(x) +#define KASSERT(x) assert(x) +#define wapbl_malloc(s) malloc(s) +#define wapbl_free(a) free(a) +#define wapbl_calloc(n, s) calloc((n), (s)) + +#endif /* !_KERNEL */ + +/* + * INTERNAL DATA STRUCTURES + */ + +/* + * This structure holds per-mount log information. 
+ * + * Legend: a = atomic access only + * r = read-only after init + * l = rwlock held + * m = mutex held + * u = unlocked access ok + * b = bufcache_lock held + */ +struct wapbl { + struct vnode *wl_logvp; /* r: log here */ + struct vnode *wl_devvp; /* r: log on this device */ + struct mount *wl_mount; /* r: mountpoint wl is associated with */ + daddr_t wl_logpbn; /* r: Physical block number of start of log */ + int wl_log_dev_bshift; /* r: logarithm of device block size of log + device */ + int wl_fs_dev_bshift; /* r: logarithm of device block size of + filesystem device */ + + unsigned wl_lock_count; /* m: Count of transactions in progress (wapbl_begin/wapbl_end update it under wl_mtx) */ + + size_t wl_circ_size; /* r: Number of bytes in buffer of log */ + size_t wl_circ_off; /* r: Number of bytes reserved at start */ + + size_t wl_bufcount_max; /* r: Number of buffers reserved for log */ + size_t wl_bufbytes_max; /* r: Number of buf bytes reserved for log */ + + off_t wl_head; /* l: Byte offset of log head */ + off_t wl_tail; /* l: Byte offset of log tail */ + /* + * head == tail == 0 means log is empty + * head == tail != 0 means log is full + * see assertions in wapbl_advance() for other boundary conditions. + * only truncate moves the tail, except when flush sets it to + * wl_header_size. only flush moves the head, except when truncate + * sets it to 0. + */ + + struct wapbl_wc_header *wl_wc_header; /* l */ + void *wl_wc_scratch; /* l: scratch space (XXX: why?!?) */ + + kmutex_t wl_mtx; /* u: short-term lock */ + krwlock_t wl_rwlock; /* u: File system transaction lock */ + + /* + * Must be held while accessing + * wl_count or wl_bufs or head or tail + */ + + /* + * Callback called from within the flush routine to flush any extra + * bits. Note that flush may be skipped without calling this if + * there are no outstanding buffers in the transaction.
+ */ + wapbl_flush_fn_t wl_flush; /* r */ + wapbl_flush_fn_t wl_flush_abort;/* r */ + + size_t wl_bufbytes; /* m: Byte count of pages in wl_bufs */ + size_t wl_bufcount; /* m: Count of buffers in wl_bufs */ + size_t wl_bcount; /* m: Total bcount of wl_bufs */ + + LIST_HEAD(, buf) wl_bufs; /* m: Buffers in current transaction */ + + kcondvar_t wl_reclaimable_cv; /* m (obviously) */ + size_t wl_reclaimable_bytes; /* m: Amount of space available for + reclamation by truncate */ + int wl_error_count; /* m: # of wl_entries with errors */ + size_t wl_reserved_bytes; /* never truncate log smaller than this */ + +#ifdef WAPBL_DEBUG_BUFBYTES + size_t wl_unsynced_bufbytes; /* Byte count of unsynced buffers */ +#endif + + daddr_t *wl_deallocblks;/* l: address of block */ + int *wl_dealloclens; /* l: size of block (in fragments, remember) */ + int wl_dealloccnt; /* l: total count */ + int wl_dealloclim; /* l: max count */ + + /* hashtable of inode numbers for allocated but unlinked inodes */ + /* synch ???
*/ + LIST_HEAD(wapbl_ino_head, wapbl_ino) *wl_inohash; + u_long wl_inohashmask; + int wl_inohashcnt; + + SIMPLEQ_HEAD(, wapbl_entry) wl_entries; /* On disk transaction + accounting */ +}; + +#ifdef WAPBL_DEBUG_PRINT +int wapbl_debug_print = WAPBL_DEBUG_PRINT; +#endif + +/****************************************************************/ +#ifdef _KERNEL + +#ifdef WAPBL_DEBUG +struct wapbl *wapbl_debug_wl; +#endif + +static int wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail); +static int wapbl_write_blocks(struct wapbl *wl, off_t *offp); +static int wapbl_write_revocations(struct wapbl *wl, off_t *offp); +static int wapbl_write_inodes(struct wapbl *wl, off_t *offp); +#endif /* _KERNEL */ + +static int wapbl_replay_prescan(struct wapbl_replay *wr); +static int wapbl_replay_get_inodes(struct wapbl_replay *wr); + +static __inline size_t wapbl_space_free(size_t avail, off_t head, + off_t tail); +static __inline size_t wapbl_space_used(size_t avail, off_t head, + off_t tail); + +#ifdef _KERNEL + +#define WAPBL_INODETRK_SIZE 83 +static int wapbl_ino_pool_refcount; +static struct pool wapbl_ino_pool; +struct wapbl_ino { + LIST_ENTRY(wapbl_ino) wi_hash; + ino_t wi_ino; + mode_t wi_mode; +}; + +static kmutex_t wapbl_global_mtx; + +static void wapbl_inodetrk_init(struct wapbl *wl, u_int size); +static void wapbl_inodetrk_free(struct wapbl *wl); +static struct wapbl_ino *wapbl_inodetrk_get(struct wapbl *wl, ino_t ino); + +static size_t wapbl_transaction_len(struct wapbl *wl); +static __inline size_t wapbl_transaction_inodes_len(struct wapbl *wl); + +/* + * This is useful for debugging. If set, the log will + * only be truncated when necessary. 
+ */ +int wapbl_lazy_truncate = 0; + +struct wapbl_ops wapbl_ops = { + .wo_wapbl_discard = wapbl_discard, + .wo_wapbl_replay_isopen = wapbl_replay_isopen1, + .wo_wapbl_replay_read = wapbl_replay_read, + .wo_wapbl_add_buf = wapbl_add_buf, + .wo_wapbl_remove_buf = wapbl_remove_buf, + .wo_wapbl_resize_buf = wapbl_resize_buf, + .wo_wapbl_begin = wapbl_begin, + .wo_wapbl_end = wapbl_end, + .wo_wapbl_junlock_assert= wapbl_junlock_assert, + + /* XXX: the following is only used to say "this is a wapbl buf" */ + .wo_wapbl_biodone = wapbl_biodone, +}; + +void +wapbl_init() +{ + + mutex_init(&wapbl_global_mtx, MUTEX_DEFAULT, IPL_NONE); + malloc_type_attach(M_WAPBL); +} + +int +wapbl_start(struct wapbl ** wlp, struct mount *mp, struct vnode *vp, + daddr_t off, size_t count, size_t blksize, struct wapbl_replay *wr, + wapbl_flush_fn_t flushfn, wapbl_flush_fn_t flushabortfn) +{ + struct wapbl *wl; + struct vnode *devvp; + daddr_t logpbn; + int error; + int log_dev_bshift = DEV_BSHIFT; + int fs_dev_bshift = DEV_BSHIFT; + int run; + + WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_start: vp=%p off=%" PRId64 + " count=%zu blksize=%zu\n", vp, off, count, blksize)); + + if (log_dev_bshift > fs_dev_bshift) { + WAPBL_PRINTF(WAPBL_PRINT_OPEN, + ("wapbl: log device's block size cannot be larger " + "than filesystem's\n")); + /* + * Not currently implemented, although it could be if + * needed someday. + */ + return ENOSYS; + } + + if (off < 0) + return EINVAL; + + if (blksize < DEV_BSIZE) + return EINVAL; + if (blksize % DEV_BSIZE) + return EINVAL; + + /* XXXTODO: verify that the full load is writable */ + + /* + * XXX check for minimum log size + * minimum is governed by minimum amount of space + * to complete a transaction. 
(probably truncate) + */ + /* XXX for now pick something minimal */ + if ((count * blksize) < MAXPHYS) { + return ENOSPC; + } + + /* Translate the log's starting block into a device vnode and physical block number */ + if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, &run)) != 0) { + return error; + } + + wl = wapbl_calloc(1, sizeof(*wl)); + rw_init(&wl->wl_rwlock); + mutex_init(&wl->wl_mtx, MUTEX_DEFAULT, IPL_NONE); + cv_init(&wl->wl_reclaimable_cv, "wapblrec"); + LIST_INIT(&wl->wl_bufs); + SIMPLEQ_INIT(&wl->wl_entries); + + wl->wl_logvp = vp; + wl->wl_devvp = devvp; + wl->wl_mount = mp; + wl->wl_logpbn = logpbn; + wl->wl_log_dev_bshift = log_dev_bshift; + wl->wl_fs_dev_bshift = fs_dev_bshift; + + wl->wl_flush = flushfn; + wl->wl_flush_abort = flushabortfn; + + /* Reserve two log device blocks for the commit headers */ + wl->wl_circ_off = 2<<wl->wl_log_dev_bshift; + wl->wl_circ_size = ((count * blksize) - wl->wl_circ_off); + /* truncate the log usage to a multiple of the log device block size */ + wl->wl_circ_size >>= wl->wl_log_dev_bshift; + wl->wl_circ_size <<= wl->wl_log_dev_bshift; + + /* + * wl_bufbytes_max limits the size of the in memory transaction space. + * - Since buffers are allocated and accounted for in units of + * PAGE_SIZE it is required to be a multiple of PAGE_SIZE + * (i.e. 1<<PAGE_SHIFT) + * - The rounding below also keeps it a multiple of the log device + * block size (1<<wl_log_dev_bshift) and of the filesystem device + * block size (1<<wl_fs_dev_bshift). + */ + wl->wl_bufbytes_max = MIN(wl->wl_circ_size, buf_memcalc()/2); + + /* Round wl_bufbytes_max to the largest power of two constraint */ + wl->wl_bufbytes_max >>= PAGE_SHIFT; + wl->wl_bufbytes_max <<= PAGE_SHIFT; + wl->wl_bufbytes_max >>= wl->wl_log_dev_bshift; + wl->wl_bufbytes_max <<= wl->wl_log_dev_bshift; + wl->wl_bufbytes_max >>= wl->wl_fs_dev_bshift; + wl->wl_bufbytes_max <<= wl->wl_fs_dev_bshift; + + /* XXX maybe use filesystem fragment size instead of 1024 */ + /* XXX fix actual number of buffers reserved per filesystem.
*/ + wl->wl_bufcount_max = (nbuf/2)*1024; + + /* XXX tie this into resource estimation */ + wl->wl_dealloclim = 2*btodb(wl->wl_bufbytes_max); + +#if WAPBL_UVM_ALLOC + wl->wl_deallocblks = (void *) uvm_km_zalloc(kernel_map, + round_page(sizeof(*wl->wl_deallocblks) * wl->wl_dealloclim)); + KASSERT(wl->wl_deallocblks != NULL); + wl->wl_dealloclens = (void *) uvm_km_zalloc(kernel_map, + round_page(sizeof(*wl->wl_dealloclens) * wl->wl_dealloclim)); + KASSERT(wl->wl_dealloclens != NULL); +#else + wl->wl_deallocblks = wapbl_malloc(sizeof(*wl->wl_deallocblks) * + wl->wl_dealloclim); + wl->wl_dealloclens = wapbl_malloc(sizeof(*wl->wl_dealloclens) * + wl->wl_dealloclim); +#endif + + wapbl_inodetrk_init(wl, WAPBL_INODETRK_SIZE); + + /* Initialize the commit header */ + { + struct wapbl_wc_header *wc; + /* the header (and its scratch copy) occupies one log device block */ + size_t len = 1<<wl->wl_log_dev_bshift; + wc = wapbl_calloc(1, len); + wc->wc_type = WAPBL_WC_HEADER; + wc->wc_len = len; + wc->wc_circ_off = wl->wl_circ_off; + wc->wc_circ_size = wl->wl_circ_size; + /* XXX wc->wc_fsid */ + wc->wc_log_dev_bshift = wl->wl_log_dev_bshift; + wc->wc_fs_dev_bshift = wl->wl_fs_dev_bshift; + wl->wl_wc_header = wc; + wl->wl_wc_scratch = wapbl_malloc(len); + } + + /* + * if there was an existing set of unlinked but + * allocated inodes, preserve it in the new + * log. + */ + if (wr && wr->wr_inodescnt) { + int i; + + WAPBL_PRINTF(WAPBL_PRINT_REPLAY, + ("wapbl_start: reusing log with %d inodes\n", + wr->wr_inodescnt)); + + /* + * It's only valid to reuse the replay log if it's + * the same as the new log we just opened.
+ */ + KDASSERT(!wapbl_replay_isopen(wr)); + KASSERT(devvp->v_rdev == wr->wr_devvp->v_rdev); + KASSERT(logpbn == wr->wr_logpbn); + KASSERT(wl->wl_circ_size == wr->wr_wc_header.wc_circ_size); + KASSERT(wl->wl_circ_off == wr->wr_wc_header.wc_circ_off); + KASSERT(wl->wl_log_dev_bshift == + wr->wr_wc_header.wc_log_dev_bshift); + KASSERT(wl->wl_fs_dev_bshift == + wr->wr_wc_header.wc_fs_dev_bshift); + + wl->wl_wc_header->wc_generation = + wr->wr_wc_header.wc_generation+1; + + /* carry every inode recorded by the replay over into the new log */ + for (i = 0; i < wr->wr_inodescnt; i++) + wapbl_register_inode(wl, wr->wr_inodes[i].wr_inumber, + wr->wr_inodes[i].wr_imode); + + /* Make sure new transaction won't overwrite old inodes list */ + KDASSERT(wapbl_transaction_len(wl) <= + wapbl_space_free(wl->wl_circ_size, wr->wr_inodeshead, + wr->wr_inodestail)); + + wl->wl_head = wl->wl_tail = wr->wr_inodeshead; + wl->wl_reclaimable_bytes = wl->wl_reserved_bytes = + wapbl_transaction_len(wl); + + error = wapbl_write_inodes(wl, &wl->wl_head); + if (error) + goto errout; + + KASSERT(wl->wl_head != wl->wl_tail); + KASSERT(wl->wl_head != 0); + } + + error = wapbl_write_commit(wl, wl->wl_head, wl->wl_tail); + if (error) { + goto errout; + } + + *wlp = wl; +#if defined(WAPBL_DEBUG) + wapbl_debug_wl = wl; +#endif + + return 0; + errout: + /* Undo the setup above; the freed sizes must match what was allocated */ + wapbl_discard(wl); + wapbl_free(wl->wl_wc_scratch); + wapbl_free(wl->wl_wc_header); +#if WAPBL_UVM_ALLOC + uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks, + round_page(sizeof(*wl->wl_deallocblks) * + wl->wl_dealloclim)); + uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens, + round_page(sizeof(*wl->wl_dealloclens) * + wl->wl_dealloclim)); +#else + wapbl_free(wl->wl_deallocblks); + wapbl_free(wl->wl_dealloclens); +#endif + wapbl_inodetrk_free(wl); + /* release the synchronisation objects initialised right after allocation */ + cv_destroy(&wl->wl_reclaimable_cv); + mutex_destroy(&wl->wl_mtx); + rw_destroy(&wl->wl_rwlock); + wapbl_free(wl); + + return error; +} + +/* + * Like wapbl_flush, only discards the transaction + * completely + */ + +void +wapbl_discard(struct wapbl *wl) +{ + struct wapbl_entry *we; + struct buf *bp; + int i; + + /* + * XXX we may consider using upgrade here + * if
we want to call flush from inside a transaction + */ + rw_enter(&wl->wl_rwlock, RW_WRITER); + wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, + wl->wl_dealloccnt); + +#ifdef WAPBL_DEBUG_PRINT + { + struct wapbl_entry *we; + pid_t pid = -1; + lwpid_t lid = -1; + if (curproc) + pid = curproc->p_pid; + if (curlwp) + lid = curlwp->l_lid; +#ifdef WAPBL_DEBUG_BUFBYTES + WAPBL_PRINTF(WAPBL_PRINT_DISCARD, + ("wapbl_discard: thread %d.%d discarding " + "transaction\n" + "\tbufcount=%zu bufbytes=%zu bcount=%zu " + "deallocs=%d inodes=%d\n" + "\terrcnt = %u, reclaimable=%zu reserved=%zu " + "unsynced=%zu\n", + pid, lid, wl->wl_bufcount, wl->wl_bufbytes, + wl->wl_bcount, wl->wl_dealloccnt, + wl->wl_inohashcnt, wl->wl_error_count, + wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, + wl->wl_unsynced_bufbytes)); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { + WAPBL_PRINTF(WAPBL_PRINT_DISCARD, + ("\tentry: bufcount = %zu, reclaimable = %zu, " + "error = %d, unsynced = %zu\n", + we->we_bufcount, we->we_reclaimable_bytes, + we->we_error, we->we_unsynced_bufbytes)); + } +#else /* !WAPBL_DEBUG_BUFBYTES */ + WAPBL_PRINTF(WAPBL_PRINT_DISCARD, + ("wapbl_discard: thread %d.%d discarding transaction\n" + "\tbufcount=%zu bufbytes=%zu bcount=%zu " + "deallocs=%d inodes=%d\n" + "\terrcnt = %u, reclaimable=%zu reserved=%zu\n", + pid, lid, wl->wl_bufcount, wl->wl_bufbytes, + wl->wl_bcount, wl->wl_dealloccnt, + wl->wl_inohashcnt, wl->wl_error_count, + wl->wl_reclaimable_bytes, wl->wl_reserved_bytes)); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { + WAPBL_PRINTF(WAPBL_PRINT_DISCARD, + ("\tentry: bufcount = %zu, reclaimable = %zu, " + "error = %d\n", + we->we_bufcount, we->we_reclaimable_bytes, + we->we_error)); + } +#endif /* !WAPBL_DEBUG_BUFBYTES */ + } +#endif /* WAPBL_DEBUG_PRINT */ + + for (i = 0; i <= wl->wl_inohashmask; i++) { + struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + + wih = &wl->wl_inohash[i]; + while ((wi = LIST_FIRST(wih)) != NULL) { + 
LIST_REMOVE(wi, wi_hash); + pool_put(&wapbl_ino_pool, wi); + KASSERT(wl->wl_inohashcnt > 0); + wl->wl_inohashcnt--; + } + } + + /* + * clean buffer list + */ + mutex_enter(&bufcache_lock); + mutex_enter(&wl->wl_mtx); + while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { + if (bbusy(bp, 0, 0, &wl->wl_mtx) == 0) { + /* + * The buffer will be unlocked and + * removed from the transaction in brelse + */ + mutex_exit(&wl->wl_mtx); + brelsel(bp, 0); + mutex_enter(&wl->wl_mtx); + } + } + mutex_exit(&wl->wl_mtx); + mutex_exit(&bufcache_lock); + + /* + * Remove references to this wl from wl_entries, free any which + * no longer have buffers, others will be freed in wapbl_biodone + * when they no longer have any buffers. + */ + while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) != NULL) { + SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); + /* XXX should we be accumulating wl_error_count + * and increasing reclaimable bytes ? */ + we->we_wapbl = NULL; + if (we->we_bufcount == 0) { +#ifdef WAPBL_DEBUG_BUFBYTES + KASSERT(we->we_unsynced_bufbytes == 0); +#endif + wapbl_free(we); + } + } + + /* Discard list of deallocs */ + wl->wl_dealloccnt = 0; + /* XXX should we clear wl_reserved_bytes? 
*/ + + KASSERT(wl->wl_bufbytes == 0); + KASSERT(wl->wl_bcount == 0); + KASSERT(wl->wl_bufcount == 0); + KASSERT(LIST_EMPTY(&wl->wl_bufs)); + KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); + KASSERT(wl->wl_inohashcnt == 0); + + rw_exit(&wl->wl_rwlock); +} + +/* + * Shut the log down: flush it and release all per-mount log state. + * If the flush fails, or unlinked inodes are still registered after it, + * the remaining state is discarded when force is set; otherwise the + * flush error (or EBUSY for leftover inodes) is returned and wl is + * left intact. Returns 0 once wl has been freed. + */ +int +wapbl_stop(struct wapbl *wl, int force) +{ + int error; + + WAPBL_PRINTF(WAPBL_PRINT_OPEN, ("wapbl_stop called\n")); + error = wapbl_flush(wl, 1); + if (error) { + if (force) + wapbl_discard(wl); + else + return error; + } + + /* Unlinked inodes persist after a flush */ + if (wl->wl_inohashcnt) { + if (force) { + wapbl_discard(wl); + } else { + return EBUSY; + } + } + + KASSERT(wl->wl_bufbytes == 0); + KASSERT(wl->wl_bcount == 0); + KASSERT(wl->wl_bufcount == 0); + KASSERT(LIST_EMPTY(&wl->wl_bufs)); + KASSERT(wl->wl_dealloccnt == 0); + KASSERT(SIMPLEQ_EMPTY(&wl->wl_entries)); + KASSERT(wl->wl_inohashcnt == 0); + + wapbl_free(wl->wl_wc_scratch); + wapbl_free(wl->wl_wc_header); +#if WAPBL_UVM_ALLOC + /* size is element size times element count, not sizeof of the product */ + uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_deallocblks, + round_page(sizeof(*wl->wl_deallocblks) * + wl->wl_dealloclim)); + uvm_km_free_wakeup(kernel_map, (vaddr_t) wl->wl_dealloclens, + round_page(sizeof(*wl->wl_dealloclens) * + wl->wl_dealloclim)); +#else + wapbl_free(wl->wl_deallocblks); + wapbl_free(wl->wl_dealloclens); +#endif + wapbl_inodetrk_free(wl); + /* release the synchronisation objects set up by wapbl_start */ + cv_destroy(&wl->wl_reclaimable_cv); + mutex_destroy(&wl->wl_mtx); + rw_destroy(&wl->wl_rwlock); + wapbl_free(wl); + + return 0; +} + +static int +wapbl_doio(void *data, size_t len, struct vnode *devvp, daddr_t pbn, int flags) +{ + struct pstats *pstats = curlwp->l_proc->p_stats; + struct buf *bp; + int error; + + KASSERT((flags & ~(B_WRITE|B_READ)) == 0); + KASSERT(devvp->v_type == VBLK); + + if ((flags & (B_WRITE|B_READ)) == B_WRITE) { + devvp->v_numoutput++; + pstats->p_ru.ru_oublock++; + } else { + pstats->p_ru.ru_inblock++; + } + + bp = getiobuf(devvp, true); + bp->b_flags = flags; + bp->b_cflags = BC_BUSY; /* silly & dubious */ + bp->b_dev = devvp->v_rdev; + bp->b_data = data; + bp->b_bufsize = bp->b_resid
= bp->b_bcount = len; + bp->b_blkno = pbn; + + WAPBL_PRINTF(WAPBL_PRINT_IO, + ("wapbl_doio: %s %d bytes at block %"PRId64" on dev 0x%x\n", + BUF_ISWRITE(bp) ? "write" : "read", bp->b_bcount, + bp->b_blkno, bp->b_dev)); + + VOP_STRATEGY(devvp, bp); + + error = biowait(bp); + putiobuf(bp); + + if (error) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_doio: %s %zu bytes at block %" PRId64 + " on dev 0x%x failed with error %d\n", + (((flags & (B_WRITE|B_READ)) == B_WRITE) ? "write":"read"), + len, pbn, devvp->v_rdev, error)); + } + + return error; +} + +int +wapbl_write(void *data, size_t len, struct vnode *devvp, daddr_t pbn) +{ + + return wapbl_doio(data, len, devvp, pbn, B_WRITE); +} + +int +wapbl_read(void *data, size_t len, struct vnode *devvp, daddr_t pbn) +{ + + return wapbl_doio(data, len, devvp, pbn, B_READ); +} + +/* + * Off is byte offset returns new offset for next write + * handles log wraparound + */ +static int +wapbl_circ_write(struct wapbl *wl, void *data, size_t len, off_t *offp) +{ + size_t slen; + off_t off = *offp; + int error; + + KDASSERT(((len >> wl->wl_log_dev_bshift) << + wl->wl_log_dev_bshift) == len); + + if (off < wl->wl_circ_off) + off = wl->wl_circ_off; + slen = wl->wl_circ_off + wl->wl_circ_size - off; + if (slen < len) { + error = wapbl_write(data, slen, wl->wl_devvp, + wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); + if (error) + return error; + data = (uint8_t *)data + slen; + len -= slen; + off = wl->wl_circ_off; + } + error = wapbl_write(data, len, wl->wl_devvp, + wl->wl_logpbn + (off >> wl->wl_log_dev_bshift)); + if (error) + return error; + off += len; + if (off >= wl->wl_circ_off + wl->wl_circ_size) + off = wl->wl_circ_off; + *offp = off; + return 0; +} + +/****************************************************************/ + +int +wapbl_begin(struct wapbl *wl, const char *file, int line) +{ + int doflush; + unsigned lockcount; + krw_t op; + + KDASSERT(wl); + +#ifdef WAPBL_DEBUG_SERIALIZE + op = RW_WRITER; +#else + op = RW_READER; 
+#endif + + /* + * XXX this needs to be made much more sophisticated. + * perhaps each wapbl_begin could reserve a specified + * number of buffers and bytes. + */ + mutex_enter(&wl->wl_mtx); + lockcount = wl->wl_lock_count; + doflush = ((wl->wl_bufbytes + (lockcount * MAXPHYS)) > + wl->wl_bufbytes_max / 2) || + ((wl->wl_bufcount + (lockcount * 10)) > + wl->wl_bufcount_max / 2) || + (wapbl_transaction_len(wl) > wl->wl_circ_size / 2); + mutex_exit(&wl->wl_mtx); + + if (doflush) { + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("force flush lockcnt=%d bufbytes=%zu " + "(max=%zu) bufcount=%zu (max=%zu)\n", + lockcount, wl->wl_bufbytes, + wl->wl_bufbytes_max, wl->wl_bufcount, + wl->wl_bufcount_max)); + } + + if (doflush) { + int error = wapbl_flush(wl, 0); + if (error) + return error; + } + + rw_enter(&wl->wl_rwlock, op); + mutex_enter(&wl->wl_mtx); + wl->wl_lock_count++; + mutex_exit(&wl->wl_mtx); + +#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE) + WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, + ("wapbl_begin thread %d.%d with bufcount=%zu " + "bufbytes=%zu bcount=%zu at %s:%d\n", + curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount, file, line)); +#endif + + return 0; +} + +void +wapbl_end(struct wapbl *wl) +{ + +#if defined(WAPBL_DEBUG_PRINT) && defined(WAPBL_DEBUG_SERIALIZE) + WAPBL_PRINTF(WAPBL_PRINT_TRANSACTION, + ("wapbl_end thread %d.%d with bufcount=%zu " + "bufbytes=%zu bcount=%zu\n", + curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount)); +#endif + + mutex_enter(&wl->wl_mtx); + KASSERT(wl->wl_lock_count > 0); + wl->wl_lock_count--; + mutex_exit(&wl->wl_mtx); + + rw_exit(&wl->wl_rwlock); +} + +void +wapbl_add_buf(struct wapbl *wl, struct buf * bp) +{ + + KASSERT(bp->b_cflags & BC_BUSY); + KASSERT(bp->b_vp); + + wapbl_jlock_assert(wl); + +#if 0 + /* + * XXX this might be an issue for swapfiles. + * see uvm_swap.c:1702 + * + * XXX2 why require it then? leap of semantics? 
+ */ + KASSERT((bp->b_cflags & BC_NOCACHE) == 0); +#endif + + mutex_enter(&wl->wl_mtx); + /* B_LOCKED set: bp is already on wl_bufs, so only move it to the list head */ + if (bp->b_flags & B_LOCKED) { + LIST_REMOVE(bp, b_wapbllist); + WAPBL_PRINTF(WAPBL_PRINT_BUFFER2, + ("wapbl_add_buf thread %d.%d re-adding buf %p " + "with %d bytes %d bcount\n", + curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, + bp->b_bcount)); + } else { + /* unlocked but dirty buffers shouldn't exist */ + KASSERT(!(bp->b_oflags & BO_DELWRI)); + /* first time in this transaction: charge its sizes to the totals */ + wl->wl_bufbytes += bp->b_bufsize; + wl->wl_bcount += bp->b_bcount; + wl->wl_bufcount++; + WAPBL_PRINTF(WAPBL_PRINT_BUFFER, + ("wapbl_add_buf thread %d.%d adding buf %p " + "with %d bytes %d bcount\n", + curproc->p_pid, curlwp->l_lid, bp, bp->b_bufsize, + bp->b_bcount)); + } + LIST_INSERT_HEAD(&wl->wl_bufs, bp, b_wapbllist); + mutex_exit(&wl->wl_mtx); + + /* B_LOCKED marks the buffer as a member of wl_bufs */ + bp->b_flags |= B_LOCKED; +} + +static void +wapbl_remove_buf_locked(struct wapbl * wl, struct buf *bp) +{ + + KASSERT(mutex_owned(&wl->wl_mtx)); + KASSERT(bp->b_cflags & BC_BUSY); + wapbl_jlock_assert(wl); + +#if 0 + /* + * XXX this might be an issue for swapfiles.
/*
 * Advance a circular-log offset by delta bytes and return the new offset.
 *
 * The KASSERTs require old to be either 0 or within [off, off + size),
 * and delta to be at most size.  An old value of 0 advanced by a
 * non-zero delta yields off + delta.  Otherwise the result is
 * old + delta, reduced by size when it would not stay below off + size.
 */
static __inline off_t
wapbl_advance(size_t size, size_t off, off_t old, size_t delta)
{
	off_t next;

	/* Accepted input ranges. */
	KASSERT(delta <= size);
	KASSERT((old == 0) || (old >= off));
	KASSERT(old < (size + off));

	if ((old == 0) && (delta != 0)) {
		next = off + delta;
	} else {
		next = old + delta;
		/* Wrap around once the end of the circular area is reached. */
		if (!((old + delta) < (size + off)))
			next = (old + delta) - size;
	}

	/* Properties that follow from the cases above. */
	KASSERT((delta != 0) || (next == old));
	KASSERT((delta == 0) || (next != 0));
	KASSERT((delta != (size)) || (next == old));

	/* Accepted output range. */
	KASSERT((next == 0) || (next >= off));
	KASSERT(next < (size + off));
	return next;
}
/*
 * wapbl_truncate(wl, minfree, waitonly)
 *
 *	wl       - log to truncate; the caller must hold wl->wl_rwlock as
 *	           writer (asserted below).
 *	minfree  - number of bytes that must be free in the circular log;
 *	           asserted to be no larger than wl_circ_size minus
 *	           wl_reserved_bytes.
 *	waitonly - if non-zero, return once enough bytes are reclaimable
 *	           without writing a commit header or moving wl_tail.
 *
 *	Returns 0 on success, EIO if the wait ended because wl_error_count
 *	became non-zero before enough bytes were reclaimable, or the error
 *	returned by wapbl_write_commit().
 */
static int
wapbl_truncate(struct wapbl *wl, size_t minfree, int waitonly)
{
	size_t delta;
	size_t avail;
	off_t head;
	off_t tail;
	int error = 0;

	KASSERT(minfree <= (wl->wl_circ_size - wl->wl_reserved_bytes));
	KASSERT(rw_write_held(&wl->wl_rwlock));

	mutex_enter(&wl->wl_mtx);

	/*
	 * First check to see if we have to do a commit
	 * at all.
	 */
	avail = wapbl_space_free(wl->wl_circ_size, wl->wl_head, wl->wl_tail);
	if (minfree < avail) {
		mutex_exit(&wl->wl_mtx);
		return 0;
	}
	/* From here on minfree is the shortfall that must be reclaimed. */
	minfree -= avail;
	/*
	 * Sleep until enough bytes are reclaimable or an error has been
	 * recorded; wl_reclaimable_cv is broadcast by wapbl_biodone().
	 */
	while ((wl->wl_error_count == 0) &&
	    (wl->wl_reclaimable_bytes < minfree)) {
		WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
		    ("wapbl_truncate: sleeping on %p wl=%p bytes=%zd "
		    "minfree=%zd\n",
		    &wl->wl_reclaimable_bytes, wl, wl->wl_reclaimable_bytes,
		    minfree));

		cv_wait(&wl->wl_reclaimable_cv, &wl->wl_mtx);
	}
	if (wl->wl_reclaimable_bytes < minfree) {
		/* The loop can only have ended because of an error. */
		KASSERT(wl->wl_error_count);
		/* XXX maybe get actual error from buffer instead someday? */
		error = EIO;
	}
	/* Work on local copies; wl_head/wl_tail are stored after the commit. */
	head = wl->wl_head;
	tail = wl->wl_tail;
	delta = wl->wl_reclaimable_bytes;

	/* If all of the entries are flushed, then be sure to keep
	 * the reserved bytes reserved.  Watch out for discarded transactions,
	 * which could leave more bytes reserved than are reclaimable.
	 */
	if (SIMPLEQ_EMPTY(&wl->wl_entries) &&
	    (delta >= wl->wl_reserved_bytes)) {
		delta -= wl->wl_reserved_bytes;
	}
	wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, &head,
	    &tail);
	KDASSERT(wl->wl_reserved_bytes <=
	    wapbl_space_used(wl->wl_circ_size, head, tail));
	mutex_exit(&wl->wl_mtx);

	if (error)
		return error;

	/* Caller only wanted to wait for the space to become reclaimable. */
	if (waitonly)
		return 0;

	/*
	 * This is where head, tail and delta are unprotected
	 * from races against itself or flush.  This is ok since
	 * we only call this routine from inside flush itself.
	 *
	 * XXX: how can it race against itself when accessed only
	 * from behind the write-locked rwlock?
	 */
	error = wapbl_write_commit(wl, head, tail);
	if (error)
		return error;

	/* The commit header is written; publish the new head and tail. */
	wl->wl_head = head;
	wl->wl_tail = tail;

	mutex_enter(&wl->wl_mtx);
	KASSERT(wl->wl_reclaimable_bytes >= delta);
	wl->wl_reclaimable_bytes -= delta;
	mutex_exit(&wl->wl_mtx);
	WAPBL_PRINTF(WAPBL_PRINT_TRUNCATE,
	    ("wapbl_truncate thread %d.%d truncating %zu bytes\n",
	    curproc->p_pid, curlwp->l_lid, delta));

	return 0;
}
+ */ + bp->b_flags &= ~(B_DONE); + simple_unlock(&bp->b_interlock); + + if (we->we_error == 0) { + mutex_enter(&wl->wl_mtx); + wl->wl_error_count++; + mutex_exit(&wl->wl_mtx); + cv_broadcast(&wl->wl_reclaimable_cv); + } + we->we_error = bp->b_error; + bp->b_error = 0; + brelse(bp); + return; +#else + /* For now, just mark the log permanently errored out */ + + mutex_enter(&wl->wl_mtx); + if (wl->wl_error_count == 0) { + wl->wl_error_count++; + cv_broadcast(&wl->wl_reclaimable_cv); + } + mutex_exit(&wl->wl_mtx); +#endif + } + + mutex_enter(&wl->wl_mtx); + + KASSERT(we->we_bufcount > 0); + we->we_bufcount--; +#ifdef WAPBL_DEBUG_BUFBYTES + KASSERT(we->we_unsynced_bufbytes >= bp->b_bufsize); + we->we_unsynced_bufbytes -= bp->b_bufsize; + KASSERT(wl->wl_unsynced_bufbytes >= bp->b_bufsize); + wl->wl_unsynced_bufbytes -= bp->b_bufsize; +#endif + + /* + * If the current transaction can be reclaimed, start + * at the beginning and reclaim any consecutive reclaimable + * transactions. If we successfully reclaim anything, + * then wakeup anyone waiting for the reclaim. 
+ */ + if (we->we_bufcount == 0) { + size_t delta = 0; + int errcnt = 0; +#ifdef WAPBL_DEBUG_BUFBYTES + KDASSERT(we->we_unsynced_bufbytes == 0); +#endif + /* + * clear any posted error, since the buffer it came from + * has successfully flushed by now + */ + while ((we = SIMPLEQ_FIRST(&wl->wl_entries)) && + (we->we_bufcount == 0)) { + delta += we->we_reclaimable_bytes; + if (we->we_error) + errcnt++; + SIMPLEQ_REMOVE_HEAD(&wl->wl_entries, we_entries); + wapbl_free(we); + } + + if (delta) { + wl->wl_reclaimable_bytes += delta; + KASSERT(wl->wl_error_count >= errcnt); + wl->wl_error_count -= errcnt; + cv_broadcast(&wl->wl_reclaimable_cv); + } + } + + mutex_exit(&wl->wl_mtx); + brelse(bp, 0); +} + +/* + * Write transactions to disk + start I/O for contents + */ +int +wapbl_flush(struct wapbl *wl, int waitfor) +{ + struct buf *bp; + struct wapbl_entry *we; + off_t off; + off_t head; + off_t tail; + size_t delta = 0; + size_t flushsize; + size_t reserved; + int error = 0; + + /* + * Do a quick check to see if a full flush can be skipped + * This assumes that the flush callback does not need to be called + * unless there are other outstanding bufs. + */ + if (!waitfor) { + size_t nbufs; + mutex_enter(&wl->wl_mtx); /* XXX need mutex here to + protect the KASSERTS */ + nbufs = wl->wl_bufcount; + KASSERT((wl->wl_bufcount == 0) == (wl->wl_bufbytes == 0)); + KASSERT((wl->wl_bufcount == 0) == (wl->wl_bcount == 0)); + mutex_exit(&wl->wl_mtx); + if (nbufs == 0) + return 0; + } + + /* + * XXX we may consider using LK_UPGRADE here + * if we want to call flush from inside a transaction + */ + rw_enter(&wl->wl_rwlock, RW_WRITER); + wl->wl_flush(wl->wl_mount, wl->wl_deallocblks, wl->wl_dealloclens, + wl->wl_dealloccnt); + + /* + * Now that we are fully locked and flushed, + * do another check for nothing to do. 
+ */ + if (wl->wl_bufcount == 0) { + goto out; + } + +#if 0 + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("wapbl_flush thread %d.%d flushing entries with " + "bufcount=%zu bufbytes=%zu\n", + curproc->p_pid, curlwp->l_lid, wl->wl_bufcount, + wl->wl_bufbytes)); +#endif + + /* Calculate amount of space needed to flush */ + flushsize = wapbl_transaction_len(wl); + + if (flushsize > (wl->wl_circ_size - wl->wl_reserved_bytes)) { + /* + * XXX this could be handled more gracefully, perhaps place + * only a partial transaction in the log and allow the + * remaining to flush without the protection of the journal. + */ + panic("wapbl_flush: current transaction too big to flush\n"); + } + + error = wapbl_truncate(wl, flushsize, 0); + if (error) + goto out2; + + off = wl->wl_head; + KASSERT((off == 0) || ((off >= wl->wl_circ_off) && + (off < wl->wl_circ_off+wl->wl_circ_size))); + error = wapbl_write_blocks(wl, &off); + if (error) + goto out2; + error = wapbl_write_revocations(wl, &off); + if (error) + goto out2; + error = wapbl_write_inodes(wl, &off); + if (error) + goto out2; + + reserved = 0; + if (wl->wl_inohashcnt) + reserved = wapbl_transaction_inodes_len(wl); + + head = wl->wl_head; + tail = wl->wl_tail; + + wapbl_advance_head(wl->wl_circ_size, wl->wl_circ_off, flushsize, + &head, &tail); +#ifdef WAPBL_DEBUG + if (head != off) { + panic("lost head! head=%"PRIdMAX" tail=%" PRIdMAX + " off=%"PRIdMAX" flush=%zu\n", + (intmax_t)head, (intmax_t)tail, (intmax_t)off, + flushsize); + } +#else + KASSERT(head == off); +#endif + + /* Opportunistically move the tail forward if we can */ + if (!wapbl_lazy_truncate) { + mutex_enter(&wl->wl_mtx); + delta = wl->wl_reclaimable_bytes; + mutex_exit(&wl->wl_mtx); + wapbl_advance_tail(wl->wl_circ_size, wl->wl_circ_off, delta, + &head, &tail); + } + + error = wapbl_write_commit(wl, head, tail); + if (error) + goto out2; + + /* poolme? or kmemme? 
*/ + we = wapbl_calloc(1, sizeof(*we)); + +#ifdef WAPBL_DEBUG_BUFBYTES + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" + " unsynced=%zu" + "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " + "inodes=%d\n", + curproc->p_pid, curlwp->l_lid, flushsize, delta, + wapbl_space_used(wl->wl_circ_size, head, tail), + wl->wl_unsynced_bufbytes, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount, wl->wl_dealloccnt, + wl->wl_inohashcnt)); +#else + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("wapbl_flush: thread %d.%d head+=%zu tail+=%zu used=%zu" + "\n\tbufcount=%zu bufbytes=%zu bcount=%zu deallocs=%d " + "inodes=%d\n", + curproc->p_pid, curlwp->l_lid, flushsize, delta, + wapbl_space_used(wl->wl_circ_size, head, tail), + wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, + wl->wl_dealloccnt, wl->wl_inohashcnt)); +#endif + + + mutex_enter(&bufcache_lock); + mutex_enter(&wl->wl_mtx); + + wl->wl_reserved_bytes = reserved; + wl->wl_head = head; + wl->wl_tail = tail; + KASSERT(wl->wl_reclaimable_bytes >= delta); + wl->wl_reclaimable_bytes -= delta; + wl->wl_dealloccnt=0; +#ifdef WAPBL_DEBUG_BUFBYTES + wl->wl_unsynced_bufbytes += wl->wl_bufbytes; +#endif + + we->we_wapbl = wl; + we->we_bufcount = wl->wl_bufcount; +#ifdef WAPBL_DEBUG_BUFBYTES + we->we_unsynced_bufbytes = wl->wl_bufbytes; +#endif + we->we_reclaimable_bytes = flushsize; + we->we_error = 0; + SIMPLEQ_INSERT_TAIL(&wl->wl_entries, we, we_entries); + + /* + * this flushes bufs in reverse order than they were queued + * it shouldn't matter, but if we care we could use TAILQ instead. + * XXX Note they will get put on the lru queue when they flush + * so we might actually want to change this to preserve order. 
+ */ + while ((bp = LIST_FIRST(&wl->wl_bufs)) != NULL) { + if (bbusy(bp, 0, 0, &wl->wl_mtx)) { + continue; + } + bp->b_iodone = wapbl_biodone; + bp->b_private = we; + bremfree(bp); + wapbl_remove_buf_locked(wl, bp); + mutex_exit(&wl->wl_mtx); + mutex_exit(&bufcache_lock); + bawrite(bp); + mutex_enter(&bufcache_lock); + mutex_enter(&wl->wl_mtx); + } + mutex_exit(&wl->wl_mtx); + mutex_exit(&bufcache_lock); + +#if 0 + WAPBL_PRINTF(WAPBL_PRINT_FLUSH, + ("wapbl_flush thread %d.%d done flushing entries...\n", + curproc->p_pid, curlwp->l_lid)); +#endif + + out: + + /* + * If the waitfor flag is set, don't return until everything is + * fully flushed and the on disk log is empty. + */ + if (waitfor) { + error = wapbl_truncate(wl, wl->wl_circ_size - + wl->wl_reserved_bytes, wapbl_lazy_truncate); + } + + out2: + if (error) { + wl->wl_flush_abort(wl->wl_mount, wl->wl_deallocblks, + wl->wl_dealloclens, wl->wl_dealloccnt); + } + +#ifdef WAPBL_DEBUG_PRINT + if (error) { + pid_t pid = -1; + lwpid_t lid = -1; + if (curproc) + pid = curproc->p_pid; + if (curlwp) + lid = curlwp->l_lid; + mutex_enter(&wl->wl_mtx); +#ifdef WAPBL_DEBUG_BUFBYTES + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_flush: thread %d.%d aborted flush: " + "error = %d\n" + "\tbufcount=%zu bufbytes=%zu bcount=%zu " + "deallocs=%d inodes=%d\n" + "\terrcnt = %d, reclaimable=%zu reserved=%zu " + "unsynced=%zu\n", + pid, lid, error, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount, + wl->wl_dealloccnt, wl->wl_inohashcnt, + wl->wl_error_count, wl->wl_reclaimable_bytes, + wl->wl_reserved_bytes, wl->wl_unsynced_bufbytes)); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("\tentry: bufcount = %zu, reclaimable = %zu, " + "error = %d, unsynced = %zu\n", + we->we_bufcount, we->we_reclaimable_bytes, + we->we_error, we->we_unsynced_bufbytes)); + } +#else + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_flush: thread %d.%d aborted flush: " + "error = %d\n" + "\tbufcount=%zu bufbytes=%zu 
bcount=%zu " + "deallocs=%d inodes=%d\n" + "\terrcnt = %d, reclaimable=%zu reserved=%zu\n", + pid, lid, error, wl->wl_bufcount, + wl->wl_bufbytes, wl->wl_bcount, + wl->wl_dealloccnt, wl->wl_inohashcnt, + wl->wl_error_count, wl->wl_reclaimable_bytes, + wl->wl_reserved_bytes)); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("\tentry: bufcount = %zu, reclaimable = %zu, " + "error = %d\n", we->we_bufcount, + we->we_reclaimable_bytes, we->we_error)); + } +#endif + mutex_exit(&wl->wl_mtx); + } +#endif + + rw_exit(&wl->wl_rwlock); + return error; +} + +/****************************************************************/ + +void +wapbl_jlock_assert(struct wapbl *wl) +{ + +#ifdef WAPBL_DEBUG_SERIALIZE + KASSERT(rw_write_held(&wl->wl_rwlock)); +#else + KASSERT(rw_read_held(&wl->wl_rwlock) || rw_write_held(&wl->wl_rwlock)); +#endif +} + +void +wapbl_junlock_assert(struct wapbl *wl) +{ + +#ifdef WAPBL_DEBUG_SERIALIZE + KASSERT(!rw_write_held(&wl->wl_rwlock)); +#endif +} + +/****************************************************************/ + +/* locks missing */ +void +wapbl_print(struct wapbl *wl, + int full, + void (*pr)(const char *, ...)) +{ + struct buf *bp; + struct wapbl_entry *we; + (*pr)("wapbl %p", wl); + (*pr)("\nlogvp = %p, devvp = %p, logpbn = %"PRId64"\n", + wl->wl_logvp, wl->wl_devvp, wl->wl_logpbn); + (*pr)("circ = %zu, header = %zu, head = %"PRIdMAX" tail = %"PRIdMAX"\n", + wl->wl_circ_size, wl->wl_circ_off, + (intmax_t)wl->wl_head, (intmax_t)wl->wl_tail); + (*pr)("fs_dev_bshift = %d, log_dev_bshift = %d\n", + wl->wl_log_dev_bshift, wl->wl_fs_dev_bshift); +#ifdef WAPBL_DEBUG_BUFBYTES + (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu reclaimable = %zu " + "reserved = %zu errcnt = %d unsynced = %zu\n", + wl->wl_bufcount, wl->wl_bufbytes, wl->wl_bcount, + wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, + wl->wl_error_count, wl->wl_unsynced_bufbytes); +#else + (*pr)("bufcount = %zu, bufbytes = %zu bcount = %zu 
reclaimable = %zu " + "reserved = %zu errcnt = %d\n", wl->wl_bufcount, wl->wl_bufbytes, + wl->wl_bcount, wl->wl_reclaimable_bytes, wl->wl_reserved_bytes, + wl->wl_error_count); +#endif + (*pr)("\tdealloccnt = %d, dealloclim = %d\n", + wl->wl_dealloccnt, wl->wl_dealloclim); + (*pr)("\tinohashcnt = %d, inohashmask = 0x%08x\n", + wl->wl_inohashcnt, wl->wl_inohashmask); + (*pr)("entries:\n"); + SIMPLEQ_FOREACH(we, &wl->wl_entries, we_entries) { +#ifdef WAPBL_DEBUG_BUFBYTES + (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d, " + "unsynced = %zu\n", + we->we_bufcount, we->we_reclaimable_bytes, + we->we_error, we->we_unsynced_bufbytes); +#else + (*pr)("\tbufcount = %zu, reclaimable = %zu, error = %d\n", + we->we_bufcount, we->we_reclaimable_bytes, we->we_error); +#endif + } + if (full) { + int cnt = 0; + (*pr)("bufs ="); + LIST_FOREACH(bp, &wl->wl_bufs, b_wapbllist) { + if (!LIST_NEXT(bp, b_wapbllist)) { + (*pr)(" %p", bp); + } else if ((++cnt % 6) == 0) { + (*pr)(" %p,\n\t", bp); + } else { + (*pr)(" %p,", bp); + } + } + (*pr)("\n"); + + (*pr)("dealloced blks = "); + { + int i; + cnt = 0; + for (i=0;iwl_dealloccnt;i++) { + (*pr)(" %"PRId64":%d,", + wl->wl_deallocblks[i], + wl->wl_dealloclens[i]); + if ((++cnt % 4) == 0) { + (*pr)("\n\t"); + } + } + } + (*pr)("\n"); + + (*pr)("registered inodes = "); + { + int i; + cnt = 0; + for (i=0;i<=wl->wl_inohashmask;i++) { + struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + + wih = &wl->wl_inohash[i]; + LIST_FOREACH(wi, wih, wi_hash) { + if (wi->wi_ino == 0) + continue; + (*pr)(" %"PRId32"/0%06"PRIo32",", + wi->wi_ino, wi->wi_mode); + if ((++cnt % 4) == 0) { + (*pr)("\n\t"); + } + } + } + (*pr)("\n"); + } + } +} + +#if defined(WAPBL_DEBUG) || defined(DDB) +void +wapbl_dump(struct wapbl *wl) +{ +#if defined(WAPBL_DEBUG) + if (!wl) + wl = wapbl_debug_wl; +#endif + if (!wl) + return; + wapbl_print(wl, 1, printf); +} +#endif + +/****************************************************************/ + +void 
+wapbl_register_deallocation(struct wapbl *wl, daddr_t blk, int len) +{ + + wapbl_jlock_assert(wl); + + /* XXX should eventually instead tie this into resource estimation */ + /* XXX this KASSERT needs locking/mutex analysis */ + KASSERT(wl->wl_dealloccnt < wl->wl_dealloclim); + wl->wl_deallocblks[wl->wl_dealloccnt] = blk; + wl->wl_dealloclens[wl->wl_dealloccnt] = len; + wl->wl_dealloccnt++; + WAPBL_PRINTF(WAPBL_PRINT_ALLOC, + ("wapbl_register_deallocation: blk=%"PRId64" len=%d\n", blk, len)); +} + +/****************************************************************/ + +/* + * Singleton pool init + */ +static void +wapbl_pool_init(int *refcnt, struct pool *pp, size_t size, const char *wchan) +{ + + mutex_enter(&wapbl_global_mtx); + if ((*refcnt)++ == 0) + pool_init(pp, size, 0, 0, 0, wchan, + &pool_allocator_nointr, IPL_NONE); + mutex_exit(&wapbl_global_mtx); +} + +static void +wapbl_pool_done(volatile int *refcnt, struct pool *pp) +{ + + mutex_enter(&wapbl_global_mtx); + if (--(*refcnt) == 0) + pool_destroy(pp); + mutex_exit(&wapbl_global_mtx); +} + +static void +wapbl_inodetrk_init(struct wapbl *wl, u_int size) +{ + + wl->wl_inohash = hashinit(size, HASH_LIST, M_WAPBL, M_WAITOK, + &wl->wl_inohashmask); + wapbl_pool_init(&wapbl_ino_pool_refcount, &wapbl_ino_pool, + sizeof(struct wapbl_ino), "wapblinopl"); +} + +static void +wapbl_inodetrk_free(struct wapbl *wl) +{ + + /* XXX this KASSERT needs locking/mutex analysis */ + KASSERT(wl->wl_inohashcnt == 0); + hashdone(wl->wl_inohash, M_WAPBL); + wapbl_pool_done(&wapbl_ino_pool_refcount, &wapbl_ino_pool); +} + +static struct wapbl_ino * +wapbl_inodetrk_get(struct wapbl *wl, ino_t ino) +{ + struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + + KASSERT(mutex_owned(&wl->wl_mtx)); + + wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; + LIST_FOREACH(wi, wih, wi_hash) { + if (ino == wi->wi_ino) + return wi; + } + return 0; +} + +void +wapbl_register_inode(struct wapbl *wl, ino_t ino, mode_t mode) +{ + struct wapbl_ino_head 
*wih; + struct wapbl_ino *wi = pool_get(&wapbl_ino_pool, PR_WAITOK); + + mutex_enter(&wl->wl_mtx); + if (wapbl_inodetrk_get(wl, ino)) { + pool_put(&wapbl_ino_pool, wi); + } else { + wi->wi_ino = ino; + wi->wi_mode = mode; + wih = &wl->wl_inohash[ino & wl->wl_inohashmask]; + LIST_INSERT_HEAD(wih, wi, wi_hash); + wl->wl_inohashcnt++; + WAPBL_PRINTF(WAPBL_PRINT_INODE, + ("wapbl_register_inode: ino=%"PRId64"\n", ino)); + } + mutex_exit(&wl->wl_mtx); +} + +void +wapbl_unregister_inode(struct wapbl *wl, ino_t ino, mode_t mode) +{ + struct wapbl_ino *wi; + + mutex_enter(&wl->wl_mtx); + wi = wapbl_inodetrk_get(wl, ino); + if (wi) { + WAPBL_PRINTF(WAPBL_PRINT_INODE, + ("wapbl_unregister_inode: ino=%"PRId64"\n", ino)); + KASSERT(wl->wl_inohashcnt > 0); + wl->wl_inohashcnt--; + LIST_REMOVE(wi, wi_hash); + mutex_exit(&wl->wl_mtx); + + pool_put(&wapbl_ino_pool, wi); + } else { + mutex_exit(&wl->wl_mtx); + } +} + +/****************************************************************/ + +static __inline size_t +wapbl_transaction_inodes_len(struct wapbl *wl) +{ + int blocklen = 1<wl_log_dev_bshift; + int iph; + + /* Calculate number of inodes described in a inodelist header */ + iph = (blocklen-offsetof(struct wapbl_wc_inodelist, wc_inodes))/ + sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); + + KASSERT(iph > 0); + + return MAX(1, howmany(wl->wl_inohashcnt, iph))*blocklen; +} + + +/* Calculate amount of space a transaction will take on disk */ +static size_t +wapbl_transaction_len(struct wapbl *wl) +{ + int blocklen = 1<wl_log_dev_bshift; + size_t len; + int bph; + + /* Calculate number of blocks described in a blocklist header */ + bph = (blocklen-offsetof(struct wapbl_wc_blocklist, wc_blocks))/ + sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); + + KASSERT(bph > 0); + + len = wl->wl_bcount; + len += howmany(wl->wl_bufcount, bph)*blocklen; + len += howmany(wl->wl_dealloccnt, bph)*blocklen; + len += wapbl_transaction_inodes_len(wl); + + return len; +} + +/* + * Perform 
commit operation + * + * Note that generation number incrementation needs to + * be protected against racing with other invocations + * of wapbl_commit. This is ok since this routine + * is only invoked from wapbl_flush + */ +static int +wapbl_write_commit(struct wapbl *wl, off_t head, off_t tail) +{ + struct wapbl_wc_header *wc = wl->wl_wc_header; + struct timespec ts; + int error; + int force = 1; + + /* XXX Calc checksum here, instead we do this for now */ + error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED); + if (error) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " + "returned %d\n", wl->wl_devvp->v_rdev, error)); + } + + wc->wc_head = head; + wc->wc_tail = tail; + wc->wc_checksum = 0; + wc->wc_version = 1; + getnanotime(&ts); /* XXX need higher resolution time here? */ + wc->wc_time = ts.tv_sec;; + wc->wc_timensec = ts.tv_nsec; + + WAPBL_PRINTF(WAPBL_PRINT_WRITE, + ("wapbl_write_commit: head = %"PRIdMAX "tail = %"PRIdMAX"\n", + (intmax_t)head, (intmax_t)tail)); + + /* + * XXX if generation will rollover, then first zero + * over second commit header before trying to write both headers. + */ + + error = wapbl_write(wc, wc->wc_len, wl->wl_devvp, + wl->wl_logpbn + wc->wc_generation % 2); + if (error) + return error; + + error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED); + if (error) { + WAPBL_PRINTF(WAPBL_PRINT_ERROR, + ("wapbl_write_commit: DIOCCACHESYNC on dev 0x%x " + "returned %d\n", wl->wl_devvp->v_rdev, error)); + } + + /* + * If the generation number was zero, write it out a second time. + * This handles initialization and generation number rollover + */ + if (wc->wc_generation++ == 0) { + error = wapbl_write_commit(wl, head, tail); + /* + * This panic should be able to be removed if we do the + * zero'ing mentioned above, and we are certain to roll + * back generation number on failure. 
+ */ + if (error) + panic("wapbl_write_commit: error writing duplicate " + "log header: %d\n", error); + } + return 0; +} + +/* Returns new offset value */ +static int +wapbl_write_blocks(struct wapbl *wl, off_t *offp) +{ + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; + int blocklen = 1<wl_log_dev_bshift; + int bph; + struct buf *bp; + off_t off = *offp; + int error; + + KASSERT(rw_write_held(&wl->wl_rwlock)); + + bph = (blocklen-offsetof(struct wapbl_wc_blocklist, wc_blocks)) / + sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); + + bp = LIST_FIRST(&wl->wl_bufs); + + while (bp) { + int cnt; + struct buf *obp = bp; + + KASSERT(bp->b_flags & B_LOCKED); + + wc->wc_type = WAPBL_WC_BLOCKS; + wc->wc_len = blocklen; + wc->wc_blkcount = 0; + while (bp && (wc->wc_blkcount < bph)) { + /* + * Make sure all the physical block numbers are up to + * date. If this is not always true on a given + * filesystem, then VOP_BMAP must be called. We + * could call VOP_BMAP here, or else in the filesystem + * specific flush callback, although neither of those + * solutions allow us to take the vnode lock. If a + * filesystem requires that we must take the vnode lock + * to call VOP_BMAP, then we can probably do it in + * bwrite when the vnode lock should already be held + * by the invoking code. 
+ */ + KASSERT((bp->b_vp->v_type == VBLK) || + (bp->b_blkno != bp->b_lblkno)); + KASSERT(bp->b_blkno > 0); + + wc->wc_blocks[wc->wc_blkcount].wc_daddr = bp->b_blkno; + wc->wc_blocks[wc->wc_blkcount].wc_dlen = bp->b_bcount; + wc->wc_len += bp->b_bcount; + wc->wc_blkcount++; + bp = LIST_NEXT(bp, b_wapbllist); + } + WAPBL_PRINTF(WAPBL_PRINT_WRITE, + ("wapbl_write_blocks: len = %u off = %"PRIdMAX"\n", + wc->wc_len, (intmax_t)off)); + + error = wapbl_circ_write(wl, wc, blocklen, &off); + if (error) + return error; + bp = obp; + cnt = 0; + while (bp && (cnt++ < bph)) { + error = wapbl_circ_write(wl, bp->b_data, + bp->b_bcount, &off); + if (error) + return error; + bp = LIST_NEXT(bp, b_wapbllist); + } + } + *offp = off; + return 0; +} + +static int +wapbl_write_revocations(struct wapbl *wl, off_t *offp) +{ + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wl->wl_wc_scratch; + int i; + int blocklen = 1<wl_log_dev_bshift; + int bph; + off_t off = *offp; + int error; + + if (wl->wl_dealloccnt == 0) + return 0; + + bph = (blocklen-offsetof(struct wapbl_wc_blocklist, wc_blocks)) / + sizeof(((struct wapbl_wc_blocklist *)0)->wc_blocks[0]); + + i = 0; + while (i < wl->wl_dealloccnt) { + wc->wc_type = WAPBL_WC_REVOCATIONS; + wc->wc_len = blocklen; + wc->wc_blkcount = 0; + while ((i < wl->wl_dealloccnt) && (wc->wc_blkcount < bph)) { + wc->wc_blocks[wc->wc_blkcount].wc_daddr = + wl->wl_deallocblks[i]; + wc->wc_blocks[wc->wc_blkcount].wc_dlen = + wl->wl_dealloclens[i]; + wc->wc_blkcount++; + i++; + } + WAPBL_PRINTF(WAPBL_PRINT_WRITE, + ("wapbl_write_revocations: len = %u off = %"PRIdMAX"\n", + wc->wc_len, (intmax_t)off)); + error = wapbl_circ_write(wl, wc, blocklen, &off); + if (error) + return error; + } + *offp = off; + return 0; +} + +static int +wapbl_write_inodes(struct wapbl *wl, off_t *offp) +{ + struct wapbl_wc_inodelist *wc = + (struct wapbl_wc_inodelist *)wl->wl_wc_scratch; + int i; + int blocklen = 1<wl_log_dev_bshift; + off_t off = *offp; + int error; + + 
struct wapbl_ino_head *wih; + struct wapbl_ino *wi; + int iph; + + iph = (blocklen-offsetof(struct wapbl_wc_inodelist, wc_inodes)) / + sizeof(((struct wapbl_wc_inodelist *)0)->wc_inodes[0]); + + i = 0; + wih = &wl->wl_inohash[0]; + wi = 0; + do { + wc->wc_type = WAPBL_WC_INODES; + wc->wc_len = blocklen; + wc->wc_inocnt = 0; + wc->wc_clear = (i == 0); + while ((i < wl->wl_inohashcnt) && (wc->wc_inocnt < iph)) { + while (!wi) { + KASSERT((wih - &wl->wl_inohash[0]) + <= wl->wl_inohashmask); + wi = LIST_FIRST(wih++); + } + wc->wc_inodes[wc->wc_inocnt].wc_inumber = wi->wi_ino; + wc->wc_inodes[wc->wc_inocnt].wc_imode = wi->wi_mode; + wc->wc_inocnt++; + i++; + wi = LIST_NEXT(wi, wi_hash); + } + WAPBL_PRINTF(WAPBL_PRINT_WRITE, + ("wapbl_write_inodes: len = %u off = %"PRIdMAX"\n", + wc->wc_len, (intmax_t)off)); + error = wapbl_circ_write(wl, wc, blocklen, &off); + if (error) + return error; + } while (i < wl->wl_inohashcnt); + + *offp = off; + return 0; +} + +#endif /* _KERNEL */ + +/****************************************************************/ + +#ifdef _KERNEL +static struct pool wapbl_blk_pool; +static int wapbl_blk_pool_refcount; +#endif +struct wapbl_blk { + LIST_ENTRY(wapbl_blk) wb_hash; + daddr_t wb_blk; + off_t wb_off; /* Offset of this block in the log */ +}; +#define WAPBL_BLKPOOL_MIN 83 + +static void +wapbl_blkhash_init(struct wapbl_replay *wr, u_int size) +{ + if (size < WAPBL_BLKPOOL_MIN) + size = WAPBL_BLKPOOL_MIN; + KASSERT(wr->wr_blkhash == 0); +#ifdef _KERNEL + wr->wr_blkhash = hashinit(size, HASH_LIST, M_WAPBL, M_WAITOK, + &wr->wr_blkhashmask); + wapbl_pool_init(&wapbl_blk_pool_refcount, &wapbl_blk_pool, + sizeof(struct wapbl_blk), "wapblblkpl"); +#else /* ! 
_KERNEL */ + /* Manually implement hashinit: initialize all hashsize buckets */ + { + unsigned long i; + unsigned long hashsize; + for (hashsize = 1; hashsize < size; hashsize <<= 1) + continue; + wr->wr_blkhash = wapbl_malloc(hashsize * sizeof(*wr->wr_blkhash)); + for (i = 0; i < hashsize; i++) + LIST_INIT(&wr->wr_blkhash[i]); + wr->wr_blkhashmask = hashsize-1; + } +#endif /* ! _KERNEL */ +} + +static void +wapbl_blkhash_free(struct wapbl_replay *wr) +{ + KASSERT(wr->wr_blkhashcnt == 0); +#ifdef _KERNEL + hashdone(wr->wr_blkhash, M_WAPBL); + wapbl_pool_done(&wapbl_blk_pool_refcount, &wapbl_blk_pool); +#else /* ! _KERNEL */ + wapbl_free(wr->wr_blkhash); +#endif /* ! _KERNEL */ +} + +static struct wapbl_blk * +wapbl_blkhash_get(struct wapbl_replay *wr, daddr_t blk) +{ + struct wapbl_blk_head *wbh; + struct wapbl_blk *wb; + wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; + LIST_FOREACH(wb, wbh, wb_hash) { + if (blk == wb->wb_blk) + return wb; + } + return 0; +} + +static void +wapbl_blkhash_ins(struct wapbl_replay *wr, daddr_t blk, off_t off) +{ + struct wapbl_blk_head *wbh; + struct wapbl_blk *wb; + wb = wapbl_blkhash_get(wr, blk); + if (wb) { + KASSERT(wb->wb_blk == blk); + wb->wb_off = off; + } else { +#ifdef _KERNEL + wb = pool_get(&wapbl_blk_pool, PR_WAITOK); +#else /* ! _KERNEL */ + wb = wapbl_malloc(sizeof(*wb)); +#endif /* ! _KERNEL */ + wb->wb_blk = blk; + wb->wb_off = off; + wbh = &wr->wr_blkhash[blk & wr->wr_blkhashmask]; + LIST_INSERT_HEAD(wbh, wb, wb_hash); + wr->wr_blkhashcnt++; + } +} + +static void +wapbl_blkhash_rem(struct wapbl_replay *wr, daddr_t blk) +{ + struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); + if (wb) { + KASSERT(wr->wr_blkhashcnt > 0); + wr->wr_blkhashcnt--; + LIST_REMOVE(wb, wb_hash); +#ifdef _KERNEL + pool_put(&wapbl_blk_pool, wb); +#else /* ! _KERNEL */ + wapbl_free(wb); +#endif /* ! 
_KERNEL */ + } +} + +static void +wapbl_blkhash_clear(struct wapbl_replay *wr) +{ + int i; + for (i = 0; i <= wr->wr_blkhashmask; i++) { + struct wapbl_blk *wb; + + while ((wb = LIST_FIRST(&wr->wr_blkhash[i]))) { + KASSERT(wr->wr_blkhashcnt > 0); + wr->wr_blkhashcnt--; + LIST_REMOVE(wb, wb_hash); +#ifdef _KERNEL + pool_put(&wapbl_blk_pool, wb); +#else /* ! _KERNEL */ + wapbl_free(wb); +#endif /* ! _KERNEL */ + } + } + KASSERT(wr->wr_blkhashcnt == 0); +} + +/****************************************************************/ + +static int +wapbl_circ_read(struct wapbl_replay *wr, void *data, size_t len, off_t *offp) +{ + size_t slen; + struct wapbl_wc_header *wc = &wr->wr_wc_header; + off_t off = *offp; + int error; + + KASSERT(((len >> wc->wc_log_dev_bshift) << + wc->wc_log_dev_bshift) == len); + if (off < wc->wc_circ_off) + off = wc->wc_circ_off; + slen = wc->wc_circ_off + wc->wc_circ_size - off; + if (slen < len) { + error = wapbl_read(data, slen, wr->wr_devvp, + wr->wr_logpbn + (off >> wc->wc_log_dev_bshift)); + if (error) + return error; + data = (uint8_t *)data + slen; + len -= slen; + off = wc->wc_circ_off; + } + error = wapbl_read(data, len, wr->wr_devvp, + wr->wr_logpbn + (off >> wc->wc_log_dev_bshift)); + if (error) + return error; + off += len; + if (off >= wc->wc_circ_off + wc->wc_circ_size) + off = wc->wc_circ_off; + *offp = off; + return 0; +} + +static void +wapbl_circ_advance(struct wapbl_replay *wr, size_t len, off_t *offp) +{ + size_t slen; + struct wapbl_wc_header *wc = &wr->wr_wc_header; + off_t off = *offp; + + KASSERT(((len >> wc->wc_log_dev_bshift) << + wc->wc_log_dev_bshift) == len); + + if (off < wc->wc_circ_off) + off = wc->wc_circ_off; + slen = wc->wc_circ_off + wc->wc_circ_size - off; + if (slen < len) { + len -= slen; + off = wc->wc_circ_off; + } + off += len; + if (off >= wc->wc_circ_off + wc->wc_circ_size) + off = wc->wc_circ_off; + *offp = off; +} + +/****************************************************************/ + +int 
+wapbl_replay_start(struct wapbl_replay **wrp, struct vnode *vp, + daddr_t off, size_t count, size_t blksize) +{ + struct wapbl_replay *wr; + int error; + struct vnode *devvp; + daddr_t logpbn; + uint8_t *scratch; + struct wapbl_wc_header *wch; + struct wapbl_wc_header *wch2; + /* Use this until we read the actual log header */ + int log_dev_bshift = DEV_BSHIFT; + size_t used; + + WAPBL_PRINTF(WAPBL_PRINT_REPLAY, + ("wapbl_replay_start: vp=%p off=%"PRId64 " count=%zu blksize=%zu\n", + vp, off, count, blksize)); + + if (off < 0) + return EINVAL; + + if (blksize < DEV_BSIZE) + return EINVAL; + if (blksize % DEV_BSIZE) + return EINVAL; + +#ifdef _KERNEL +#if 0 + /* XXX vp->v_size isn't reliably set for VBLK devices, + * especially root. However, we might still want to verify + * that the full load is readable */ + if ((off+count)*blksize > vp->v_size) + return EINVAL; +#endif + + if ((error = VOP_BMAP(vp, off, &devvp, &logpbn, 0)) != 0) { + return error; + } +#else /* ! _KERNEL */ + devvp = vp; + logpbn = off; +#endif /* ! 
_KERNEL */ + + scratch = wapbl_malloc(MAXBSIZE); + + error = wapbl_read(scratch, 2<wc_type != WAPBL_WC_HEADER) { + printf("Unrecognized wapbl magic: 0x%08x\n", wch->wc_type); + error = EFTYPE; + goto errout; + } + + if (wch2->wc_generation > wch->wc_generation) + wch = wch2; + + wr = wapbl_calloc(1, sizeof(*wr)); + + wr->wr_logvp = vp; + wr->wr_devvp = devvp; + wr->wr_logpbn = logpbn; + + wr->wr_scratch = scratch; + + memcpy(&wr->wr_wc_header, wch, sizeof(wr->wr_wc_header)); + + used = wapbl_space_used(wch->wc_circ_size, wch->wc_head, wch->wc_tail); + + WAPBL_PRINTF(WAPBL_PRINT_REPLAY, + ("wapbl_replay: head=%"PRId64" tail=%"PRId64" off=%"PRId64 + " len=%"PRId64" used=%zu\n", + wch->wc_head, wch->wc_tail, wch->wc_circ_off, + wch->wc_circ_size, used)); + + wapbl_blkhash_init(wr, (used >> wch->wc_fs_dev_bshift)); + error = wapbl_replay_prescan(wr); + if (error) { + wapbl_replay_stop(wr); + wapbl_replay_free(wr); + return error; + } + + error = wapbl_replay_get_inodes(wr); + if (error) { + wapbl_replay_stop(wr); + wapbl_replay_free(wr); + return error; + } + + *wrp = wr; + return 0; + + errout: + wapbl_free(scratch); + return error; +} + +void +wapbl_replay_stop(struct wapbl_replay *wr) +{ + + WAPBL_PRINTF(WAPBL_PRINT_REPLAY, ("wapbl_replay_stop called\n")); + + KDASSERT(wapbl_replay_isopen(wr)); + + wapbl_free(wr->wr_scratch); + wr->wr_scratch = 0; + + wr->wr_logvp = 0; + + wapbl_blkhash_clear(wr); + wapbl_blkhash_free(wr); +} + +void +wapbl_replay_free(struct wapbl_replay *wr) +{ + + KDASSERT(!wapbl_replay_isopen(wr)); + + if (wr->wr_inodes) + wapbl_free(wr->wr_inodes); + wapbl_free(wr); +} + +int +wapbl_replay_isopen1(struct wapbl_replay *wr) +{ + + return wapbl_replay_isopen(wr); +} + +static int +wapbl_replay_prescan(struct wapbl_replay *wr) +{ + off_t off; + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int error; + + int logblklen = 1<wc_log_dev_bshift; + int fsblklen = 1<wc_fs_dev_bshift; + + wapbl_blkhash_clear(wr); + + off = wch->wc_tail; + while (off 
!= wch->wc_head) { + struct wapbl_wc_null *wcn; + off_t saveoff = off; + error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); + if (error) + goto errout; + wcn = (struct wapbl_wc_null *)wr->wr_scratch; + switch (wcn->wc_type) { + case WAPBL_WC_BLOCKS: + { + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wr->wr_scratch; + int i; + for (i = 0; i < wc->wc_blkcount; i++) { + int j, n; + /* + * Enter each physical block into the + * hashtable independently + */ + n = wc->wc_blocks[i].wc_dlen >> + wch->wc_fs_dev_bshift; + for (j = 0; j < n; j++) { + wapbl_blkhash_ins(wr, + wc->wc_blocks[i].wc_daddr+j, + off); + wapbl_circ_advance(wr, + fsblklen, &off); + } + } + } + break; + + case WAPBL_WC_REVOCATIONS: + { + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wr->wr_scratch; + int i; + for (i = 0; i < wc->wc_blkcount; i++) { + int j, n; + /* + * Remove any blocks found from the + * hashtable + */ + n = wc->wc_blocks[i].wc_dlen >> + wch->wc_fs_dev_bshift; + for (j = 0; j < n; j++) { + wapbl_blkhash_rem(wr, + wc->wc_blocks[i].wc_daddr+j); + } + } + } + break; + + case WAPBL_WC_INODES: + { + struct wapbl_wc_inodelist *wc = + (struct wapbl_wc_inodelist *)wr->wr_scratch; + /* + * Keep track of where we found this so we + * can use it later + */ + if (wc->wc_clear) { + wr->wr_inodestail = saveoff; + wr->wr_inodescnt = 0; + } + if (wr->wr_inodestail) + wr->wr_inodeshead = off; + wr->wr_inodescnt += wc->wc_inocnt; + } + break; + default: + printf("Unrecognized wapbl type: 0x%08x\n", + wcn->wc_type); + error = EFTYPE; + goto errout; + } + wapbl_circ_advance(wr, wcn->wc_len, &saveoff); + if (off != saveoff) { + printf("wapbl_replay: corrupted records\n"); + error = EFTYPE; + goto errout; + } + } + return 0; + + errout: + wapbl_blkhash_clear(wr); + return error; +} + +static int +wapbl_replay_get_inodes(struct wapbl_replay *wr) +{ + off_t off; + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int logblklen = 1<wc_log_dev_bshift; + int cnt= 0; 
+ + KDASSERT(wapbl_replay_isopen(wr)); + + if (wr->wr_inodescnt == 0) + return 0; + + KASSERT(!wr->wr_inodes); + + wr->wr_inodes = wapbl_malloc(wr->wr_inodescnt*sizeof(wr->wr_inodes[0])); + + off = wr->wr_inodestail; + + while (off != wr->wr_inodeshead) { + struct wapbl_wc_null *wcn; + int error; + off_t saveoff = off; + error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); + if (error) { + wapbl_free(wr->wr_inodes); + wr->wr_inodes = 0; + return error; + } + wcn = (struct wapbl_wc_null *)wr->wr_scratch; + switch (wcn->wc_type) { + case WAPBL_WC_BLOCKS: + case WAPBL_WC_REVOCATIONS: + break; + case WAPBL_WC_INODES: + { + struct wapbl_wc_inodelist *wc = + (struct wapbl_wc_inodelist *)wr->wr_scratch; + /* + * Keep track of where we found this so we + * can use it later + */ + if (wc->wc_clear) { + cnt = 0; + } + /* This memcpy assumes that wr_inodes is + * laid out the same as wc_inodes. */ + memcpy(&wr->wr_inodes[cnt], wc->wc_inodes, + wc->wc_inocnt*sizeof(wc->wc_inodes[0])); + cnt += wc->wc_inocnt; + } + break; + default: + KASSERT(0); + } + off = saveoff; + wapbl_circ_advance(wr, wcn->wc_len, &off); + } + KASSERT(cnt == wr->wr_inodescnt); + return 0; +} + +#ifdef DEBUG +int +wapbl_replay_verify(struct wapbl_replay *wr, struct vnode *fsdevvp) +{ + off_t off; + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int mismatchcnt = 0; + int logblklen = 1<wc_log_dev_bshift; + int fsblklen = 1<wc_fs_dev_bshift; + void *scratch1 = wapbl_malloc(MAXBSIZE); + void *scratch2 = wapbl_malloc(MAXBSIZE); + int error = 0; + + KDASSERT(wapbl_replay_isopen(wr)); + + off = wch->wc_tail; + while (off != wch->wc_head) { + struct wapbl_wc_null *wcn; +#ifdef DEBUG + off_t saveoff = off; +#endif + error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); + if (error) + goto out; + wcn = (struct wapbl_wc_null *)wr->wr_scratch; + switch (wcn->wc_type) { + case WAPBL_WC_BLOCKS: + { + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wr->wr_scratch; + int i; + for (i = 
0; i < wc->wc_blkcount; i++) { + int foundcnt = 0; + int dirtycnt = 0; + int j, n; + /* + * Check each physical block into the + * hashtable independently + */ + n = wc->wc_blocks[i].wc_dlen >> + wch->wc_fs_dev_bshift; + for (j = 0; j < n; j++) { + struct wapbl_blk *wb = + wapbl_blkhash_get(wr, + wc->wc_blocks[i].wc_daddr+j); + if (wb && (wb->wb_off == off)) { + foundcnt++; + error = + wapbl_circ_read(wr, + scratch1, fsblklen, + &off); + if (error) + goto out; + error = + wapbl_read(scratch2, + fsblklen, fsdevvp, + wb->wb_blk); + if (error) + goto out; + if (memcmp(scratch1, + scratch2, + fsblklen)) { + printf( + "wapbl_verify: mismatch block %"PRId64" at off %"PRIdMAX"\n", + wb->wb_blk, (intmax_t)off); + dirtycnt++; + mismatchcnt++; + } + } else { + wapbl_circ_advance(wr, + fsblklen, &off); + } + } +#if 0 + /* + * If all of the blocks in an entry + * are clean, then remove all of its + * blocks from the hashtable since they + * never will need replay. + */ + if ((foundcnt != 0) && + (dirtycnt == 0)) { + off = saveoff; + wapbl_circ_advance(wr, + logblklen, &off); + for (j = 0; j < n; j++) { + struct wapbl_blk *wb = + wapbl_blkhash_get(wr, + wc->wc_blocks[i].wc_daddr+j); + if (wb && + (wb->wb_off == off)) { + wapbl_blkhash_rem(wr, wb->wb_blk); + } + wapbl_circ_advance(wr, + fsblklen, &off); + } + } +#endif + } + } + break; + case WAPBL_WC_REVOCATIONS: + case WAPBL_WC_INODES: + break; + default: + KASSERT(0); + } +#ifdef DEBUG + wapbl_circ_advance(wr, wcn->wc_len, &saveoff); + KASSERT(off == saveoff); +#endif + } + out: + wapbl_free(scratch1); + wapbl_free(scratch2); + if (!error && mismatchcnt) + error = EFTYPE; + return error; +} +#endif + +int +wapbl_replay_write(struct wapbl_replay *wr, struct vnode *fsdevvp) +{ + off_t off; + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int logblklen = 1<wc_log_dev_bshift; + int fsblklen = 1<wc_fs_dev_bshift; + void *scratch1 = wapbl_malloc(MAXBSIZE); + int error = 0; + + KDASSERT(wapbl_replay_isopen(wr)); + + /* + * This 
parses the journal for replay, although it could + * just as easily walk the hashtable instead. + */ + + off = wch->wc_tail; + while (off != wch->wc_head) { + struct wapbl_wc_null *wcn; +#ifdef DEBUG + off_t saveoff = off; +#endif + error = wapbl_circ_read(wr, wr->wr_scratch, logblklen, &off); + if (error) + goto out; + wcn = (struct wapbl_wc_null *)wr->wr_scratch; + switch (wcn->wc_type) { + case WAPBL_WC_BLOCKS: + { + struct wapbl_wc_blocklist *wc = + (struct wapbl_wc_blocklist *)wr->wr_scratch; + int i; + for (i = 0; i < wc->wc_blkcount; i++) { + int j, n; + /* + * Check each physical block against + * the hashtable independently + */ + n = wc->wc_blocks[i].wc_dlen >> + wch->wc_fs_dev_bshift; + for (j = 0; j < n; j++) { + struct wapbl_blk *wb = + wapbl_blkhash_get(wr, + wc->wc_blocks[i].wc_daddr+j); + if (wb && (wb->wb_off == off)) { + error = wapbl_circ_read( + wr, scratch1, + fsblklen, &off); + if (error) + goto out; + error = + wapbl_write(scratch1, + fsblklen, fsdevvp, + wb->wb_blk); + if (error) + goto out; + } else { + wapbl_circ_advance(wr, + fsblklen, &off); + } + } + } + } + break; + case WAPBL_WC_REVOCATIONS: + case WAPBL_WC_INODES: + break; + default: + KASSERT(0); + } +#ifdef DEBUG + wapbl_circ_advance(wr, wcn->wc_len, &saveoff); + KASSERT(off == saveoff); +#endif + } + out: + wapbl_free(scratch1); + return error; +} + +int +wapbl_replay_read(struct wapbl_replay *wr, void *data, daddr_t blk, long len) +{ + struct wapbl_wc_header *wch = &wr->wr_wc_header; + int fsblklen = 1<wc_fs_dev_bshift; + + KDASSERT(wapbl_replay_isopen(wr)); + + KASSERT((len % fsblklen) == 0); + + while (len != 0) { + struct wapbl_blk *wb = wapbl_blkhash_get(wr, blk); + if (wb) { + off_t off = wb->wb_off; + int error; + error = wapbl_circ_read(wr, data, fsblklen, &off); + if (error) + return error; + } + data = (uint8_t *)data + fsblklen; + len -= fsblklen; + blk++; + } + return 0; +} Index: sys/kern/vnode_if.c =================================================================== 
RCS file: /cvsroot/src/sys/kern/vnode_if.c,v retrieving revision 1.76 diff -d -p -u -r1.76 vnode_if.c --- sys/kern/vnode_if.c 25 Jan 2008 14:32:46 -0000 1.76 +++ sys/kern/vnode_if.c 2 Mar 2008 10:27:29 -0000 @@ -802,6 +802,7 @@ VOP_FSYNC(struct vnode *vp, mpsafe = (vp->v_vflag & VV_MPSAFE); if (!mpsafe) { KERNEL_LOCK(1, curlwp); } error = (VCALL(vp, VOFFSET(vop_fsync), &a)); + if (!mpsafe) { KERNEL_UNLOCK_ONE(curlwp); } return error; } Index: sys/miscfs/genfs/genfs_io.c =================================================================== RCS file: /cvsroot/src/sys/miscfs/genfs/genfs_io.c,v retrieving revision 1.5 diff -d -p -u -r1.5 genfs_io.c --- sys/miscfs/genfs/genfs_io.c 18 Jan 2008 11:01:23 -0000 1.5 +++ sys/miscfs/genfs/genfs_io.c 2 Mar 2008 10:27:34 -0000 @@ -589,8 +589,22 @@ loopdone: */ if (!error && sawhole && blockalloc) { - error = GOP_ALLOC(vp, startoffset, npages << PAGE_SHIFT, 0, - cred); + /* + * XXX: This assumes that we come here only via + * the mmio path + */ + if (vp->v_mount->mnt_wapbl && write) { + error = WAPBL_BEGIN(vp->v_mount); + } + + if (!error) { + error = GOP_ALLOC(vp, startoffset, + npages << PAGE_SHIFT, 0, cred); + if (vp->v_mount->mnt_wapbl && write) { + WAPBL_END(vp->v_mount); + } + } + UVMHIST_LOG(ubchist, "gop_alloc off 0x%x/0x%x -> %d", startoffset, npages << PAGE_SHIFT, error,0); if (!error) { Index: sys/rump/fs/bin/ffs/ffs.c =================================================================== RCS file: /cvsroot/src/sys/rump/fs/bin/ffs/ffs.c,v retrieving revision 1.6 diff -d -p -u -r1.6 ffs.c --- sys/rump/fs/bin/ffs/ffs.c 7 Nov 2007 15:51:07 -0000 1.6 +++ sys/rump/fs/bin/ffs/ffs.c 2 Mar 2008 10:27:39 -0000 @@ -97,7 +97,7 @@ main(int argc, char *argv[]) memset(&args, 0, sizeof(args)); args.fspec = argv[0]; - rv = p2k_run_fs(MOUNT_FFS, argv[0], argv[1], mntflags, + rv = p2k_run_fs(MOUNT_FFS, argv[0], argv[1], mntflags | MNT_LOG, &args, sizeof(args), pflags); if (rv) err(1, "mount"); Index: sys/rump/fs/lib/libffs/Makefile 
=================================================================== RCS file: /cvsroot/src/sys/rump/fs/lib/libffs/Makefile,v retrieving revision 1.3 diff -d -p -u -r1.3 Makefile --- sys/rump/fs/lib/libffs/Makefile 24 Sep 2007 01:31:07 -0000 1.3 +++ sys/rump/fs/lib/libffs/Makefile 2 Mar 2008 10:27:39 -0000 @@ -9,9 +9,9 @@ LIB= ffs SRCS= ffs_alloc.c ffs_balloc.c ffs_bswap.c ffs_inode.c \ ffs_softdep.stub.c ffs_subr.c ffs_tables.c ffs_vfsops.c \ - ffs_vnops.c ffs_snapshot.c + ffs_vnops.c ffs_snapshot.c ffs_wapbl.c -CPPFLAGS+= -DFFS_NO_SNAPSHOT -DFFS_EI +CPPFLAGS+= -DFFS_NO_SNAPSHOT -DFFS_EI -DWAPBL CFLAGS+= -Wno-pointer-sign .include Index: sys/rump/fs/lib/libufs/Makefile =================================================================== RCS file: /cvsroot/src/sys/rump/fs/lib/libufs/Makefile,v retrieving revision 1.4 diff -d -p -u -r1.4 Makefile --- sys/rump/fs/lib/libufs/Makefile 24 Sep 2007 01:31:07 -0000 1.4 +++ sys/rump/fs/lib/libufs/Makefile 2 Mar 2008 10:27:39 -0000 @@ -8,9 +8,9 @@ LIB= ufs .PATH: ${NETBSDSRCDIR}/sys/ufs/ufs SRCS= ufs_bmap.c ufs_dirhash.c ufs_ihash.c ufs_inode.c ufs_lookup.c \ - ufs_vfsops.c ufs_vnops.c + ufs_vfsops.c ufs_vnops.c ufs_wapbl.c -CPPFLAGS+= -DUFS_DIRHASH -DFFS_EI +CPPFLAGS+= -DUFS_DIRHASH -DFFS_EI -DWAPBL .include .include Index: sys/rump/librump/rumpkern/Makefile =================================================================== RCS file: /cvsroot/src/sys/rump/librump/rumpkern/Makefile,v retrieving revision 1.29 diff -d -p -u -r1.29 Makefile --- sys/rump/librump/rumpkern/Makefile 28 Jan 2008 15:48:18 -0000 1.29 +++ sys/rump/librump/rumpkern/Makefile 2 Mar 2008 10:27:39 -0000 @@ -23,7 +23,7 @@ SRCS+= clock_subr.c kern_descrip.c kern_ subr_bufq.c subr_hash.c subr_prf2.c subr_specificdata.c \ subr_time.c subr_workqueue.c sys_generic.c vfs_bio.c \ vfs_cache.c vfs_getcwd.c vfs_init.c vfs_lookup.c vfs_subr.c \ - vfs_subr2.c vfs_vnops.c vnode_if.c + vfs_subr2.c vfs_vnops.c vfs_wapbl.c vnode_if.c # sys/miscfs SRCS+= genfs_vfsops.c 
genfs_vnops.c sync_subr.c Index: sys/rump/librump/rumpkern/rump.c =================================================================== RCS file: /cvsroot/src/sys/rump/librump/rumpkern/rump.c,v retrieving revision 1.35 diff -d -p -u -r1.35 rump.c --- sys/rump/librump/rumpkern/rump.c 30 Jan 2008 14:57:24 -0000 1.35 +++ sys/rump/librump/rumpkern/rump.c 2 Mar 2008 10:27:39 -0000 @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -117,6 +118,7 @@ rump_init() bufinit(); filedesc_init(); selsysinit(); + wapbl_init(); rumpvfs_init(); Index: sys/rump/librump/rumpuser/Makefile =================================================================== RCS file: /cvsroot/src/sys/rump/librump/rumpuser/Makefile,v retrieving revision 1.5 diff -d -p -u -r1.5 Makefile --- sys/rump/librump/rumpuser/Makefile 24 Jan 2008 22:41:08 -0000 1.5 +++ sys/rump/librump/rumpuser/Makefile 2 Mar 2008 10:27:40 -0000 @@ -7,7 +7,7 @@ LIB= rumpuser SRCS= rumpuser.c rumpuser_pth.c -#CPPFLAGS+= -DRUMP_WITHOUT_THREADS +CPPFLAGS+= -DRUMP_WITHOUT_THREADS RUMPKERNEL= no Index: sys/sys/Makefile =================================================================== RCS file: /cvsroot/src/sys/sys/Makefile,v retrieving revision 1.108 diff -d -p -u -r1.108 Makefile --- sys/sys/Makefile 16 Jan 2008 12:34:53 -0000 1.108 +++ sys/sys/Makefile 2 Mar 2008 10:27:40 -0000 @@ -19,12 +19,13 @@ INCS= acct.h agpio.h aio.h ansi.h ataio. 
joystick.h \ kcore.h kgdb.h kmem.h ksem.h ksyms.h ktrace.h \ lkm.h localedef.h lock.h lockf.h lwp.h lwpctl.h \ - malloc.h mallocvar.h mbuf.h md4.h \ - md5.h midiio.h mman.h module.h mount.h mqueue.h msg.h msgbuf.h mtio.h mutex.h \ + malloc.h mallocvar.h mbuf.h md4.h md5.h midiio.h \ + mman.h module.h mount.h mqueue.h msg.h msgbuf.h mtio.h mutex.h \ namei.h null.h \ param.h pipe.h pmc.h poll.h pool.h power.h proc.h \ protosw.h pset.h ptrace.h queue.h \ - ras.h reboot.h radioio.h resource.h resourcevar.h rmd160.h rnd.h rwlock.h \ + ras.h reboot.h radioio.h resource.h resourcevar.h rmd160.h rnd.h \ + rwlock.h \ scanio.h sched.h scsiio.h select.h selinfo.h sem.h sha1.h sha2.h \ shm.h siginfo.h signal.h signalvar.h sigtypes.h simplelock.h \ sleepq.h socket.h \ @@ -36,7 +37,7 @@ INCS= acct.h agpio.h aio.h ansi.h ataio. ttydefaults.h ttydev.h types.h \ ucontext.h ucred.h uio.h un.h unistd.h unpcb.h user.h utsname.h uuid.h \ vadvise.h verified_exec.h vmmeter.h vnode.h vnode_if.h \ - wait.h wdog.h + wait.h wapbl.h wdog.h INCSYMLINKS=\ sys/exec_elf.h /usr/include/elf.h \ Index: sys/sys/buf.h =================================================================== RCS file: /cvsroot/src/sys/sys/buf.h,v retrieving revision 1.106 diff -d -p -u -r1.106 buf.h --- sys/sys/buf.h 20 Feb 2008 17:13:29 -0000 1.106 +++ sys/sys/buf.h 2 Mar 2008 10:27:41 -0000 @@ -168,6 +168,7 @@ struct buf { LIST_ENTRY(buf) b_hash; /* c: hash chain */ LIST_ENTRY(buf) b_vnbufs; /* c: associated vnode */ TAILQ_ENTRY(buf) b_freelist; /* c: position if not active */ + LIST_ENTRY(buf) b_wapbllist; /* c: transaction buffer list */ daddr_t b_lblkno; /* c: logical block number */ int b_freelistindex;/* c: free list index (BQ_) */ u_int b_cflags; /* c: BC_* flags */ Index: sys/sys/fstypes.h =================================================================== RCS file: /cvsroot/src/sys/sys/fstypes.h,v retrieving revision 1.21 diff -d -p -u -r1.21 fstypes.h --- sys/sys/fstypes.h 10 Oct 2007 20:42:32 -0000 1.21 +++ 
sys/sys/fstypes.h 2 Mar 2008 10:27:42 -0000 @@ -87,7 +87,6 @@ typedef struct fhandle fhandle_t; #define __MNT_UNUSED2 0x00200000 #define __MNT_UNUSED3 0x00800000 #define __MNT_UNUSED4 0x01000000 -#define __MNT_UNUSED5 0x02000000 #define MNT_RDONLY 0x00000001 /* read only filesystem */ #define MNT_SYNCHRONOUS 0x00000002 /* file system written synchronously */ @@ -98,6 +97,7 @@ typedef struct fhandle fhandle_t; #define MNT_ASYNC 0x00000040 /* file system written asynchronously */ #define MNT_NOCOREDUMP 0x00008000 /* don't write core dumps to this FS */ #define MNT_IGNORE 0x00100000 /* don't show entry in df */ +#define MNT_LOG 0x02000000 /* Use logging */ #define MNT_NOATIME 0x04000000 /* Never update access times in fs */ #define MNT_SYMPERM 0x20000000 /* recognize symlink permission */ #define MNT_NODEVMTIME 0x40000000 /* Never update mod times for devs */ @@ -116,7 +116,8 @@ typedef struct fhandle fhandle_t; { MNT_NOATIME, 0, "noatime" }, \ { MNT_SYMPERM, 0, "symperm" }, \ { MNT_NODEVMTIME, 0, "nodevmtime" }, \ - { MNT_SOFTDEP, 0, "soft dependencies" }, + { MNT_SOFTDEP, 0, "soft dependencies" }, \ + { MNT_LOG, 0, "log" }, /* * exported mount flags. @@ -176,7 +177,8 @@ typedef struct fhandle fhandle_t; MNT_EXPUBLIC | \ MNT_LOCAL | \ MNT_QUOTA | \ - MNT_ROOTFS) + MNT_ROOTFS | \ + MNT_LOG) /* * External filesystem control flags. 
@@ -223,7 +225,7 @@ typedef struct fhandle fhandle_t; "\35MNT_EXPUBLIC" \ "\34MNT_EXNORESPORT" \ "\33MNT_NOATIME" \ - "\32MNT_UNUSED" \ + "\32MNT_LOG" \ "\31MNT_UNUSED" \ "\30MNT_UNUSED" \ "\27MNT_GETARGS" \ Index: sys/sys/mount.h =================================================================== RCS file: /cvsroot/src/sys/sys/mount.h,v retrieving revision 1.173 diff -d -p -u -r1.173 mount.h --- sys/sys/mount.h 30 Jan 2008 11:47:03 -0000 1.173 +++ sys/sys/mount.h 2 Mar 2008 10:27:44 -0000 @@ -120,6 +120,11 @@ struct mount { struct statvfs mnt_stat; /* cache of filesystem stats */ specificdata_reference mnt_specdataref; /* subsystem specific data */ + struct wapbl_ops + *mnt_wapbl_op; /* logging ops */ + struct wapbl *mnt_wapbl; /* log info */ + struct wapbl_replay + *mnt_wapbl_replay; /* replay support XXX: what? */ }; /* @@ -276,6 +281,45 @@ int fsname##_suspendctl(struct mount *, #define VFS_ATTACH(vfs) __link_set_add_data(vfsops, vfs) +/* + * This operations vector is so wapbl can be wrapped into a filesystem lkm. + * XXX Eventually, we want to move this functionality + * down into the filesystems themselves so that this isn't needed. 
+ */ +struct wapbl_ops { + void (*wo_wapbl_discard)(struct wapbl *); + int (*wo_wapbl_replay_isopen)(struct wapbl_replay *); + int (*wo_wapbl_replay_read)(struct wapbl_replay *, void *, daddr_t, long); + void (*wo_wapbl_add_buf)(struct wapbl *, struct buf *); + void (*wo_wapbl_remove_buf)(struct wapbl *, struct buf *); + void (*wo_wapbl_resize_buf)(struct wapbl *, struct buf *, long, long); + int (*wo_wapbl_begin)(struct wapbl *, const char *, int); + void (*wo_wapbl_end)(struct wapbl *); + void (*wo_wapbl_junlock_assert)(struct wapbl *); + void (*wo_wapbl_biodone)(struct buf *); +}; +#define WAPBL_DISCARD(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_discard)((MP)->mnt_wapbl) +#define WAPBL_REPLAY_ISOPEN(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_replay_isopen)((MP)->mnt_wapbl_replay) +#define WAPBL_REPLAY_READ(MP, DATA, BLK, LEN) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_replay_read)((MP)->mnt_wapbl_replay, \ + (DATA), (BLK), (LEN)) +#define WAPBL_ADD_BUF(MP, BP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_add_buf)((MP)->mnt_wapbl, (BP)) +#define WAPBL_REMOVE_BUF(MP, BP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_remove_buf)((MP)->mnt_wapbl, (BP)) +#define WAPBL_RESIZE_BUF(MP, BP, OLDSZ, OLDCNT) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_resize_buf)((MP)->mnt_wapbl, (BP), \ + (OLDSZ), (OLDCNT)) +#define WAPBL_BEGIN(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_begin)((MP)->mnt_wapbl, \ + __FILE__, __LINE__) +#define WAPBL_END(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_end)((MP)->mnt_wapbl) +#define WAPBL_JUNLOCK_ASSERT(MP) \ + (*(MP)->mnt_wapbl_op->wo_wapbl_junlock_assert)((MP)->mnt_wapbl) + struct vfs_hooks { void (*vh_unmount)(struct mount *); }; Index: sys/sys/statvfs.h =================================================================== RCS file: /cvsroot/src/sys/sys/statvfs.h,v retrieving revision 1.13 diff -d -p -u -r1.13 statvfs.h --- sys/sys/statvfs.h 10 Nov 2007 07:23:10 -0000 1.13 +++ sys/sys/statvfs.h 2 Mar 2008 10:27:46 -0000 @@ -125,6 +125,7 @@ struct statvfs { #define ST_SYMPERM MNT_SYMPERM #define 
ST_NODEVMTIME MNT_NODEVMTIME #define ST_SOFTDEP MNT_SOFTDEP +#define ST_LOG MNT_LOG #define ST_EXRDONLY MNT_EXRDONLY #define ST_EXPORTED MNT_EXPORTED Index: sys/sys/vnode.h =================================================================== RCS file: /cvsroot/src/sys/sys/vnode.h,v retrieving revision 1.190 diff -d -p -u -r1.190 vnode.h --- sys/sys/vnode.h 5 Feb 2008 14:19:52 -0000 1.190