Index: sys/dev/raidframe/files.raidframe =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/files.raidframe,v retrieving revision 1.6 diff -u -p -r1.6 files.raidframe --- sys/dev/raidframe/files.raidframe 10 Jun 2008 12:49:16 -0000 1.6 +++ sys/dev/raidframe/files.raidframe 1 Nov 2009 21:30:39 -0000 @@ -42,6 +42,7 @@ file dev/raidframe/rf_paritylog.c raid file dev/raidframe/rf_paritylogDiskMgr.c raid file dev/raidframe/rf_paritylogging.c raid file dev/raidframe/rf_parityloggingdags.c raid +file dev/raidframe/rf_paritymap.c raid file dev/raidframe/rf_parityscan.c raid file dev/raidframe/rf_pq.c raid file dev/raidframe/rf_pqdeg.c raid Index: sys/dev/raidframe/raidframeio.h =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/raidframeio.h,v retrieving revision 1.4 diff -u -p -r1.4 raidframeio.h --- sys/dev/raidframe/raidframeio.h 28 Apr 2008 20:23:56 -0000 1.4 +++ sys/dev/raidframe/raidframeio.h 1 Nov 2009 21:30:39 -0000 @@ -125,4 +125,9 @@ #define RAIDFRAME_CHECK_PARITYREWRITE_STATUS_EXT _IOWR ('r', 33, RF_ProgressInfo_t *) #define RAIDFRAME_CHECK_COPYBACK_STATUS_EXT _IOWR ('r', 34, RF_ProgressInfo_t *) +#define RAIDFRAME_PARITYMAP_STATUS _IOR('r', 37, struct rf_pmstat) +#define RAIDFRAME_PARITYMAP_GET_DISABLE _IOR('r', 38, int) +#define RAIDFRAME_PARITYMAP_SET_DISABLE _IOW('r', 39, int) +#define RAIDFRAME_PARITYMAP_SET_PARAMS _IOW('r', 40, struct rf_pmparams) + #endif /* !_RF_RAIDFRAMEIO_H_ */ Index: sys/dev/raidframe/raidframevar.h =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/raidframevar.h,v retrieving revision 1.12 diff -u -p -r1.12 raidframevar.h --- sys/dev/raidframe/raidframevar.h 28 Apr 2008 20:23:56 -0000 1.12 +++ sys/dev/raidframe/raidframevar.h 1 Nov 2009 21:29:17 -0000 @@ -265,6 +265,9 @@ typedef struct RF_StripeLockDesc_s RF_St typedef struct RF_ThreadGroup_s 
RF_ThreadGroup_t; typedef struct RF_ThroughputStats_s RF_ThroughputStats_t; +struct rf_paritymap; +struct rf_paritymap_ondisk; + /* * Important assumptions regarding ordering of the states in this list * have been made!!! Before disturbing this ordering, look at code in @@ -446,7 +449,16 @@ typedef struct RF_ComponentLabel_s { u_int partitionSize; /* number of blocks on this *partition*. Must exactly match the partition size from the disklabel. */ - int future_use[33]; /* Future expansion */ + /* Parity map stuff. */ + int parity_map_modcount; /* If equal to mod_counter, then the last + kernel to touch this label was + parity-map-enabled. */ + u_int parity_map_flags; /* See top of rf_paritymap.h */ + int parity_map_tickms; /* Length of parity map cooldown ticks. */ + int parity_map_ntick; /* Number of parity map cooldown ticks. */ + u_int parity_map_regions; /* Number of parity map regions. */ + int future_use[28]; /* Future expansion */ + int autoconfigure; /* automatically configure this RAID set. 0 == no, 1 == yes */ int root_partition; /* Use this set as / @@ -569,4 +581,28 @@ typedef struct RF_LayoutSW_s { } RF_LayoutSW_t; #endif + +/* Parity map declarations. 
*/ +#define RF_PARITYMAP_NREG 4096 +#define RF_PARITYMAP_NBYTE howmany(RF_PARITYMAP_NREG, NBBY) + +struct rf_pmctrs { + uint64_t nwrite, ncachesync, nclearing; +}; + +struct rf_pmparams { + int cooldown, tickms; + u_int regions; +}; + +struct rf_pmstat { + int enabled; /* if not set, rest of struct is zeroed */ + struct rf_pmparams params; + daddr_t region_size; + char dirty[RF_PARITYMAP_NBYTE]; + struct rf_pmctrs ctrs; +}; + + + #endif /* !_RF_RAIDFRAMEVAR_H_ */ Index: sys/dev/raidframe/rf_copyback.c =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_copyback.c,v retrieving revision 1.41 diff -u -p -r1.41 rf_copyback.c --- sys/dev/raidframe/rf_copyback.c 26 Jan 2008 20:44:37 -0000 1.41 +++ sys/dev/raidframe/rf_copyback.c 1 Nov 2009 21:29:17 -0000 @@ -86,7 +86,7 @@ rf_ConfigureCopyback(RF_ShutdownList_t * void rf_CopybackReconstructedData(RF_Raid_t *raidPtr) { - RF_ComponentLabel_t c_label; + RF_ComponentLabel_t *c_label; int found, retcode; RF_CopybackDesc_t *desc; RF_RowCol_t fcol; @@ -206,19 +206,17 @@ rf_CopybackReconstructedData(RF_Raid_t * /* Data has been restored. Fix up the component label. */ /* Don't actually need the read here.. */ - raidread_component_label( raidPtr->raid_cinfo[fcol].ci_dev, - raidPtr->raid_cinfo[fcol].ci_vp, - &c_label); - - raid_init_component_label( raidPtr, &c_label ); - - c_label.row = 0; - c_label.column = fcol; - c_label.partitionSize = raidPtr->Disks[fcol].partitionSize; - - raidwrite_component_label( raidPtr->raid_cinfo[fcol].ci_dev, - raidPtr->raid_cinfo[fcol].ci_vp, - &c_label); + + c_label = raidget_component_label(raidPtr, fcol); + raid_init_component_label(raidPtr, c_label); + + c_label->row = 0; + c_label->column = fcol; + c_label->partitionSize = raidPtr->Disks[fcol].partitionSize; + + raidflush_component_label(raidPtr, fcol); + + /* XXXjld why is this here? 
*/ rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE); } Index: sys/dev/raidframe/rf_disks.c =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_disks.c,v retrieving revision 1.70.10.1 diff -u -p -r1.70.10.1 rf_disks.c --- sys/dev/raidframe/rf_disks.c 4 Apr 2009 17:15:14 -0000 1.70.10.1 +++ sys/dev/raidframe/rf_disks.c 1 Nov 2009 21:30:39 -0000 @@ -132,10 +132,7 @@ rf_ConfigureDisks(RF_ShutdownList_t **li goto fail; if (disks[c].status == rf_ds_optimal) { - raidread_component_label( - raidPtr->raid_cinfo[c].ci_dev, - raidPtr->raid_cinfo[c].ci_vp, - &raidPtr->raid_cinfo[c].ci_label); + raidfetch_component_label(raidPtr, c); } if (disks[c].status != rf_ds_optimal) { @@ -461,7 +458,7 @@ rf_AutoConfigureDisks(RF_Raid_t *raidPtr raidPtr->raid_cinfo[c].ci_vp = ac->vp; raidPtr->raid_cinfo[c].ci_dev = ac->dev; - memcpy(&raidPtr->raid_cinfo[c].ci_label, + memcpy(raidget_component_label(raidPtr, c), ac->clabel, sizeof(*ac->clabel)); snprintf(diskPtr->devname, sizeof(diskPtr->devname), "/dev/%s", ac->devname); @@ -731,7 +728,7 @@ rf_CheckLabels(RF_Raid_t *raidPtr, RF_Co num_mod = 0; for (c = 0; c < raidPtr->numCol; c++) { - ci_label = &raidPtr->raid_cinfo[c].ci_label; + ci_label = raidget_component_label(raidPtr, c); found=0; for(i=0;iserial_number) { @@ -786,7 +783,7 @@ rf_CheckLabels(RF_Raid_t *raidPtr, RF_Co } for (c = 0; c < raidPtr->numCol; c++) { - ci_label = &raidPtr->raid_cinfo[c].ci_label; + ci_label = raidget_component_label(raidPtr, c); if (serial_number != ci_label->serial_number) { hosed_column = c; break; @@ -841,7 +838,7 @@ rf_CheckLabels(RF_Raid_t *raidPtr, RF_Co } for (c = 0; c < raidPtr->numCol; c++) { - ci_label = &raidPtr->raid_cinfo[c].ci_label; + ci_label = raidget_component_label(raidPtr, c); if (mod_number != ci_label->mod_counter) { if (hosed_column == c) { /* same one. 
Can @@ -908,7 +905,7 @@ rf_CheckLabels(RF_Raid_t *raidPtr, RF_Co for (c = 0; c < raidPtr->numCol; c++) { dev_name = &cfgPtr->devnames[0][c][0]; - ci_label = &raidPtr->raid_cinfo[c].ci_label; + ci_label = raidget_component_label(raidPtr, c); if (c == hosed_column) { printf("raid%d: Ignoring %s\n", Index: sys/dev/raidframe/rf_driver.c =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_driver.c,v retrieving revision 1.118.10.1 diff -u -p -r1.118.10.1 rf_driver.c --- sys/dev/raidframe/rf_driver.c 23 Dec 2008 04:03:00 -0000 1.118.10.1 +++ sys/dev/raidframe/rf_driver.c 1 Nov 2009 21:30:39 -0000 @@ -107,6 +107,7 @@ __KERNEL_RCSID(0, "$NetBSD: rf_driver.c, #include "rf_options.h" #include "rf_shutdown.h" #include "rf_kintf.h" +#include "rf_paritymap.h" #include @@ -239,6 +240,9 @@ rf_Shutdown(RF_Raid_t *raidPtr) raidPtr->valid = 0; + if (raidPtr->parity_map != NULL) + rf_paritymap_detach(raidPtr); + rf_update_component_labels(raidPtr, RF_FINAL_COMPONENT_UPDATE); rf_UnconfigureVnodes(raidPtr); @@ -414,6 +418,11 @@ rf_Configure(RF_Raid_t *raidPtr, RF_Conf return(rc); } + /* Set up parity map stuff, if applicable. 
*/ +#ifndef RF_NO_PARITY_MAP + rf_paritymap_attach(raidPtr, cfgPtr->force); +#endif + raidPtr->valid = 1; printf("raid%d: %s\n", raidPtr->raidid, @@ -674,6 +683,11 @@ rf_DoAccess(RF_Raid_t * raidPtr, RF_IoTy #endif desc->async_flag = async_flag; + if (raidPtr->parity_map != NULL && + type == RF_IO_TYPE_WRITE) + rf_paritymap_begin(raidPtr->parity_map, raidAddress, + numBlocks); + rf_ContinueRaidAccess(desc); return (0); Index: sys/dev/raidframe/rf_engine.c =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_engine.c,v retrieving revision 1.39 diff -u -p -r1.39 rf_engine.c --- sys/dev/raidframe/rf_engine.c 16 Nov 2006 01:33:23 -0000 1.39 +++ sys/dev/raidframe/rf_engine.c 1 Nov 2009 21:29:17 -0000 @@ -68,6 +68,7 @@ __KERNEL_RCSID(0, "$NetBSD: rf_engine.c, #include "rf_shutdown.h" #include "rf_raid.h" #include "rf_kintf.h" +#include "rf_paritymap.h" static void rf_ShutdownEngine(void *); static void DAGExecutionThread(RF_ThreadArg_t arg); @@ -855,6 +856,13 @@ rf_RaidIOThread(RF_ThreadArg_t arg) &(raidPtr->iodone_lock)); } + /* Check for deferred parity-map-related work. 
*/ + if (raidPtr->parity_map != NULL) { + simple_unlock(&(raidPtr->iodone_lock)); + rf_paritymap_checkwork(raidPtr->parity_map); + simple_lock(&(raidPtr->iodone_lock)); + } + /* See what I/Os, if any, have arrived */ while ((req = TAILQ_FIRST(&(raidPtr->iodone))) != NULL) { TAILQ_REMOVE(&(raidPtr->iodone), req, iodone_entries); Index: sys/dev/raidframe/rf_kintf.h =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_kintf.h,v retrieving revision 1.20 diff -u -p -r1.20 rf_kintf.h --- sys/dev/raidframe/rf_kintf.h 27 Aug 2006 05:07:12 -0000 1.20 +++ sys/dev/raidframe/rf_kintf.h 1 Nov 2009 21:29:17 -0000 @@ -41,18 +41,24 @@ int rf_GetSpareTableFromDaemon(RF_Sp void raidstart(RF_Raid_t * raidPtr); int rf_DispatchKernelIO(RF_DiskQueue_t * queue, RF_DiskQueueData_t * req); -int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); -int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +int raidfetch_component_label(RF_Raid_t *, RF_RowCol_t); +RF_ComponentLabel_t *raidget_component_label(RF_Raid_t *, RF_RowCol_t); +int raidflush_component_label(RF_Raid_t *, RF_RowCol_t); + +void rf_paritymap_kern_write(RF_Raid_t *, struct rf_paritymap_ondisk *); +void rf_paritymap_kern_read(RF_Raid_t *, struct rf_paritymap_ondisk *); #define RF_NORMAL_COMPONENT_UPDATE 0 #define RF_FINAL_COMPONENT_UPDATE 1 void rf_update_component_labels(RF_Raid_t *, int); -int raidmarkclean(dev_t dev, struct vnode *b_vp, int); -int raidmarkdirty(dev_t dev, struct vnode *b_vp, int); +int raidmarkclean(RF_Raid_t *, RF_RowCol_t); +int raidmarkdirty(RF_Raid_t *, RF_RowCol_t); void raid_init_component_label(RF_Raid_t *, RF_ComponentLabel_t *); void rf_print_component_label(RF_ComponentLabel_t *); void rf_UnconfigureVnodes( RF_Raid_t * ); void rf_close_component( RF_Raid_t *, struct vnode *, int); void rf_disk_unbusy(RF_RaidAccessDesc_t *); int rf_getdisksize(struct vnode *, struct lwp *, RF_RaidDisk_t *); +int 
rf_sync_component_caches(RF_Raid_t *raidPtr); #endif /* _RF__RF_KINTF_H_ */ + Index: sys/dev/raidframe/rf_netbsdkintf.c =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_netbsdkintf.c,v retrieving revision 1.250.4.4 diff -u -p -r1.250.4.4 rf_netbsdkintf.c --- sys/dev/raidframe/rf_netbsdkintf.c 4 Apr 2009 17:15:14 -0000 1.250.4.4 +++ sys/dev/raidframe/rf_netbsdkintf.c 3 Nov 2009 17:34:32 -0000 @@ -165,6 +165,7 @@ __KERNEL_RCSID(0, "$NetBSD: rf_netbsdkin #include #include +#include #include "raid.h" #include "opt_raid_autoconfig.h" #include "rf_raid.h" @@ -213,6 +214,17 @@ static int raid_match(struct device *, s static void raid_attach(struct device *, struct device *, void *); static int raid_detach(struct device *, int); +static int raidread_component_area(dev_t, struct vnode *, void *, size_t, + daddr_t, daddr_t); +static int raidwrite_component_area(dev_t, struct vnode *, void *, size_t, + daddr_t, daddr_t, int); + +static int raidwrite_component_label(dev_t, struct vnode *, + RF_ComponentLabel_t *); +static int raidread_component_label(dev_t, struct vnode *, + RF_ComponentLabel_t *); + + dev_type_open(raidopen); dev_type_close(raidclose); dev_type_read(raidread); @@ -317,7 +329,6 @@ void rf_release_all_vps(RF_ConfigSet_t * void rf_cleanup_config_set(RF_ConfigSet_t *); int rf_have_enough_components(RF_ConfigSet_t *); int rf_auto_config_set(RF_ConfigSet_t *, int *); -static int rf_sync_component_caches(RF_Raid_t *raidPtr); static int raidautoconfig = 0; /* Debugging, mostly. Set to 0 to not allow autoconfig to take place. 
@@ -981,7 +992,7 @@ raidioctl(dev_t dev, u_long cmd, void *d u_char *specific_buf; int retcode = 0; int column; - int raidid; +/* int raidid; */ struct rf_recon_req *rrcopy, *rr; RF_ComponentLabel_t *clabel; RF_ComponentLabel_t *ci_label; @@ -1071,6 +1082,10 @@ raidioctl(dev_t dev, u_long cmd, void *d case RAIDFRAME_SET_ROOT: case RAIDFRAME_DELETE_COMPONENT: case RAIDFRAME_INCORPORATE_HOT_SPARE: + case RAIDFRAME_PARITYMAP_STATUS: + case RAIDFRAME_PARITYMAP_GET_DISABLE: + case RAIDFRAME_PARITYMAP_SET_DISABLE: + case RAIDFRAME_PARITYMAP_SET_PARAMS: if ((rs->sc_flags & RAIDF_INITED) == 0) return (ENXIO); } @@ -1206,18 +1221,18 @@ raidioctl(dev_t dev, u_long cmd, void *d /* need to read the component label for the disk indicated by row,column in clabel */ - /* For practice, let's get it directly fromdisk, rather - than from the in-core copy */ - RF_Malloc( clabel, sizeof( RF_ComponentLabel_t ), - (RF_ComponentLabel_t *)); - if (clabel == NULL) - return (ENOMEM); + /* + * Perhaps there should be an option to skip the in-core + * copy and hit the disk, as with disklabel(8). 
+ */ + RF_Malloc(clabel, sizeof(*clabel), (RF_ComponentLabel_t *)); + /* clabel is only a scratch copy of the request; it is freed on every exit path below. */ retcode = copyin( *clabel_ptr, clabel, sizeof(RF_ComponentLabel_t)); if (retcode) { RF_Free( clabel, sizeof(RF_ComponentLabel_t)); return(retcode); } @@ -1227,21 +1240,21 @@ raidioctl(dev_t dev, u_long cmd, void *d if ((column < 0) || (column >= raidPtr->numCol + raidPtr->numSpare)) { RF_Free( clabel, sizeof(RF_ComponentLabel_t)); return(EINVAL); } - retcode = raidread_component_label(raidPtr->Disks[column].dev, - raidPtr->raid_cinfo[column].ci_vp, - clabel ); + RF_Free(clabel, sizeof(*clabel)); + + clabel = raidget_component_label(raidPtr, column); if (retcode == 0) { retcode = copyout(clabel, *clabel_ptr, sizeof(RF_ComponentLabel_t)); } - RF_Free(clabel, sizeof(RF_ComponentLabel_t)); return (retcode); +#if 0 case RAIDFRAME_SET_COMPONENT_LABEL: clabel = (RF_ComponentLabel_t *) data; @@ -1273,13 +1285,11 @@ raidioctl(dev_t dev, u_long cmd, void *d /* XXX and before it is, we need to fill in the rest of the fields!?!?!?! */ -#if 0 - raidwrite_component_label( - raidPtr->Disks[column].dev, - raidPtr->raid_cinfo[column].ci_vp, - clabel ); -#endif + memcpy(raidget_component_label(raidPtr, column), + clabel, sizeof(*clabel)); + raidflush_component_label(raidPtr, column); return (0); +#endif case RAIDFRAME_INIT_LABELS: clabel = (RF_ComponentLabel_t *) data; @@ -1292,27 +1302,24 @@ raidioctl(dev_t dev, u_long cmd, void *d raidPtr->serial_number = clabel->serial_number; - RF_Malloc(ci_label, sizeof(RF_ComponentLabel_t), - (RF_ComponentLabel_t *)); - if (ci_label == NULL) - return (ENOMEM); - - raid_init_component_label(raidPtr, ci_label); - ci_label->serial_number = clabel->serial_number; - ci_label->row = 0; /* we dont' pretend to support more */ - for(column=0;columnnumCol;column++) { diskPtr = &raidPtr->Disks[column]; if (!RF_DEAD_DISK(diskPtr->status)) { - ci_label->partitionSize = diskPtr->partitionSize; + ci_label = raidget_component_label(raidPtr, + column); + /* Zeroing this is important. 
*/ + memset(ci_label, 0, sizeof(*ci_label)); + raid_init_component_label(raidPtr, ci_label); + ci_label->serial_number = + raidPtr->serial_number; + ci_label->row = 0; /* we dont' pretend to support more */ + ci_label->partitionSize = + diskPtr->partitionSize; ci_label->column = column; - raidwrite_component_label( - raidPtr->Disks[column].dev, - raidPtr->raid_cinfo[column].ci_vp, - ci_label ); + raidflush_component_label(raidPtr, column); } + /* XXXjld what about the spares? */ } - RF_Free(ci_label, sizeof(RF_ComponentLabel_t)); return (retcode); case RAIDFRAME_SET_AUTOCONFIG: @@ -1473,6 +1480,28 @@ raidioctl(dev_t dev, u_long cmd, void *d *(int *) data = raidPtr->parity_good; return (0); + case RAIDFRAME_PARITYMAP_STATUS: + rf_paritymap_status(raidPtr->parity_map, + (struct rf_pmstat *)data); + return 0; + + case RAIDFRAME_PARITYMAP_SET_PARAMS: + if (raidPtr->parity_map == NULL) + return ENOENT; /* ??? */ + if (0 != rf_paritymap_set_params(raidPtr->parity_map, + (struct rf_pmparams *)data, 1)) + return EINVAL; + return 0; + + case RAIDFRAME_PARITYMAP_GET_DISABLE: + *(int *) data = rf_paritymap_get_disable(raidPtr); + return 0; + + case RAIDFRAME_PARITYMAP_SET_DISABLE: + rf_paritymap_set_disable(raidPtr, *(int *)data); + /* XXX should errors be passed up? 
*/ + return 0; + case RAIDFRAME_RESET_ACCTOTALS: memset(&raidPtr->acc_totals, 0, sizeof(raidPtr->acc_totals)); return (0); @@ -2380,34 +2409,75 @@ raidunlock(struct raid_softc *rs) #define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ #define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ +#define RF_PARITY_MAP_OFFSET \ + (RF_COMPONENT_INFO_OFFSET + RF_COMPONENT_INFO_SIZE) +#define RF_PARITY_MAP_SIZE RF_PARITYMAP_NBYTE int -raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter) +raidmarkclean(RF_Raid_t *raidPtr, RF_RowCol_t col) { - RF_ComponentLabel_t clabel; - raidread_component_label(dev, b_vp, &clabel); - clabel.mod_counter = mod_counter; - clabel.clean = RF_RAID_CLEAN; - raidwrite_component_label(dev, b_vp, &clabel); + RF_ComponentLabel_t *clabel; + + clabel = raidget_component_label(raidPtr, col); + clabel->clean = RF_RAID_CLEAN; + raidflush_component_label(raidPtr, col); return(0); } int -raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter) +raidmarkdirty(RF_Raid_t *raidPtr, RF_RowCol_t col) { - RF_ComponentLabel_t clabel; - raidread_component_label(dev, b_vp, &clabel); - clabel.mod_counter = mod_counter; - clabel.clean = RF_RAID_DIRTY; - raidwrite_component_label(dev, b_vp, &clabel); + RF_ComponentLabel_t *clabel; + + clabel = raidget_component_label(raidPtr, col); + clabel->clean = RF_RAID_DIRTY; + raidflush_component_label(raidPtr, col); return(0); } -/* ARGSUSED */ int +raidfetch_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) +{ + return raidread_component_label(raidPtr->Disks[col].dev, + raidPtr->raid_cinfo[col].ci_vp, + &raidPtr->raid_cinfo[col].ci_label); +} + +RF_ComponentLabel_t * +raidget_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) +{ + return &raidPtr->raid_cinfo[col].ci_label; +} + +int +raidflush_component_label(RF_Raid_t *raidPtr, RF_RowCol_t col) +{ + RF_ComponentLabel_t *label; + + label = &raidPtr->raid_cinfo[col].ci_label; + label->mod_counter = raidPtr->mod_counter; +#ifndef RF_NO_PARITY_MAP + 
label->parity_map_modcount = label->mod_counter; +#endif + return raidwrite_component_label(raidPtr->Disks[col].dev, + raidPtr->raid_cinfo[col].ci_vp, label); +} + + +static int raidread_component_label(dev_t dev, struct vnode *b_vp, - RF_ComponentLabel_t *clabel) + RF_ComponentLabel_t *clabel) +{ + return raidread_component_area(dev, b_vp, clabel, + sizeof(RF_ComponentLabel_t), + RF_COMPONENT_INFO_OFFSET, RF_COMPONENT_INFO_SIZE); +} + +/* ARGSUSED */ +static int +raidread_component_area(dev_t dev, struct vnode *b_vp, void *data, + size_t msize, daddr_t offset, daddr_t dsize) { struct buf *bp; const struct bdevsw *bdev; @@ -2423,14 +2493,14 @@ raidread_component_label(dev_t dev, stru } /* get a block of the appropriate size... */ - bp = geteblk((int)RF_COMPONENT_INFO_SIZE); + bp = geteblk((int)dsize); bp->b_dev = dev; /* get our ducks in a row for the read */ - bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE; - bp->b_bcount = RF_COMPONENT_INFO_SIZE; + bp->b_blkno = offset / DEV_BSIZE; + bp->b_bcount = dsize; bp->b_flags |= B_READ; - bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE; + bp->b_resid = dsize; bdev = bdevsw_lookup(bp->b_dev); if (bdev == NULL) @@ -2440,40 +2510,51 @@ raidread_component_label(dev_t dev, stru error = biowait(bp); if (!error) { - memcpy(clabel, bp->b_data, - sizeof(RF_ComponentLabel_t)); + memcpy(data, bp->b_data, msize); } brelse(bp, 0); return(error); } -/* ARGSUSED */ -int + + +static int raidwrite_component_label(dev_t dev, struct vnode *b_vp, - RF_ComponentLabel_t *clabel) + RF_ComponentLabel_t *clabel) +{ + return raidwrite_component_area(dev, b_vp, clabel, + sizeof(RF_ComponentLabel_t), + RF_COMPONENT_INFO_OFFSET, RF_COMPONENT_INFO_SIZE, 0); +} + +/* ARGSUSED */ +static int +raidwrite_component_area(dev_t dev, struct vnode *b_vp, void *data, + size_t msize, daddr_t offset, daddr_t dsize, int asyncp) { struct buf *bp; const struct bdevsw *bdev; int error; /* get a block of the appropriate size... 
*/ - bp = geteblk((int)RF_COMPONENT_INFO_SIZE); + bp = geteblk((int)dsize); bp->b_dev = dev; /* get our ducks in a row for the write */ - bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE; - bp->b_bcount = RF_COMPONENT_INFO_SIZE; - bp->b_flags |= B_WRITE; - bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE; + bp->b_blkno = offset / DEV_BSIZE; + bp->b_bcount = dsize; + bp->b_flags |= B_WRITE | (asyncp ? B_ASYNC : 0); + bp->b_resid = dsize; - memset(bp->b_data, 0, RF_COMPONENT_INFO_SIZE ); - - memcpy(bp->b_data, clabel, sizeof(RF_ComponentLabel_t)); + memset(bp->b_data, 0, dsize); + memcpy(bp->b_data, data, msize); bdev = bdevsw_lookup(bp->b_dev); if (bdev == NULL) return (ENXIO); (*bdev->d_strategy)(bp); + if (asyncp) + return 0; error = biowait(bp); brelse(bp, 0); if (error) { @@ -2486,9 +2567,48 @@ raidwrite_component_label(dev_t dev, str } void +rf_paritymap_kern_write(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) +{ + int c; + + for (c = 0; c < raidPtr->numCol; c++) { + /* Skip dead disks. */ + if (RF_DEAD_DISK(raidPtr->Disks[c].status)) + continue; + /* XXXjld: what if an error occurs here? */ + raidwrite_component_area(raidPtr->Disks[c].dev, + raidPtr->raid_cinfo[c].ci_vp, map, + RF_PARITYMAP_NBYTE, + RF_PARITY_MAP_OFFSET, RF_PARITY_MAP_SIZE, 0); + } +} + +void +rf_paritymap_kern_read(RF_Raid_t *raidPtr, struct rf_paritymap_ondisk *map) +{ + struct rf_paritymap_ondisk tmp; + int c; + + for (c = 0; c < raidPtr->numCol; c++) { + /* Skip dead disks. 
*/ + if (RF_DEAD_DISK(raidPtr->Disks[c].status)) + continue; + raidread_component_area(raidPtr->Disks[c].dev, + raidPtr->raid_cinfo[c].ci_vp, &tmp, + RF_PARITYMAP_NBYTE, + RF_PARITY_MAP_OFFSET, RF_PARITY_MAP_SIZE); + if (c == 0) { + memcpy(map, &tmp, sizeof(*map)); + } else { + rf_paritymap_merge(map, &tmp); + } + } +} + +void rf_markalldirty(RF_Raid_t *raidPtr) { - RF_ComponentLabel_t clabel; + RF_ComponentLabel_t *clabel; int sparecol; int c; int j; @@ -2499,19 +2619,13 @@ rf_markalldirty(RF_Raid_t *raidPtr) /* we don't want to touch (at all) a disk that has failed */ if (!RF_DEAD_DISK(raidPtr->Disks[c].status)) { - raidread_component_label( - raidPtr->Disks[c].dev, - raidPtr->raid_cinfo[c].ci_vp, - &clabel); - if (clabel.status == rf_ds_spared) { + clabel = raidget_component_label(raidPtr, c); + if (clabel->status == rf_ds_spared) { /* XXX do something special... but whatever you do, don't try to access it!! */ } else { - raidmarkdirty( - raidPtr->Disks[c].dev, - raidPtr->raid_cinfo[c].ci_vp, - raidPtr->mod_counter); + raidmarkdirty(raidPtr, c); } } } @@ -2535,23 +2649,18 @@ rf_markalldirty(RF_Raid_t *raidPtr) } } - raidread_component_label( - raidPtr->Disks[sparecol].dev, - raidPtr->raid_cinfo[sparecol].ci_vp, - &clabel); + clabel = raidget_component_label(raidPtr, sparecol); /* make sure status is noted */ - raid_init_component_label(raidPtr, &clabel); + raid_init_component_label(raidPtr, clabel); - clabel.row = 0; - clabel.column = scol; + clabel->row = 0; + clabel->column = scol; /* Note: we *don't* change status from rf_ds_used_spare to rf_ds_optimal */ /* clabel.status = rf_ds_optimal; */ - raidmarkdirty(raidPtr->Disks[sparecol].dev, - raidPtr->raid_cinfo[sparecol].ci_vp, - raidPtr->mod_counter); + raidmarkdirty(raidPtr, sparecol); } } } @@ -2560,7 +2669,7 @@ rf_markalldirty(RF_Raid_t *raidPtr) void rf_update_component_labels(RF_Raid_t *raidPtr, int final) { - RF_ComponentLabel_t clabel; + RF_ComponentLabel_t *clabel; int sparecol; int c; int j; @@ 
-2575,29 +2684,17 @@ rf_update_component_labels(RF_Raid_t *ra for (c = 0; c < raidPtr->numCol; c++) { if (raidPtr->Disks[c].status == rf_ds_optimal) { - raidread_component_label( - raidPtr->Disks[c].dev, - raidPtr->raid_cinfo[c].ci_vp, - &clabel); + clabel = raidget_component_label(raidPtr, c); /* make sure status is noted */ - clabel.status = rf_ds_optimal; - - /* bump the counter */ - clabel.mod_counter = raidPtr->mod_counter; + clabel->status = rf_ds_optimal; /* note what unit we are configured as */ - clabel.last_unit = raidPtr->raidid; + clabel->last_unit = raidPtr->raidid; - raidwrite_component_label( - raidPtr->Disks[c].dev, - raidPtr->raid_cinfo[c].ci_vp, - &clabel); + raidflush_component_label(raidPtr, c); if (final == RF_FINAL_COMPONENT_UPDATE) { if (raidPtr->parity_good == RF_RAID_CLEAN) { - raidmarkclean( - raidPtr->Disks[c].dev, - raidPtr->raid_cinfo[c].ci_vp, - raidPtr->mod_counter); + raidmarkclean(raidPtr, c); } } } @@ -2625,28 +2722,19 @@ rf_update_component_labels(RF_Raid_t *ra } /* XXX shouldn't *really* need this... 
*/ - raidread_component_label( - raidPtr->Disks[sparecol].dev, - raidPtr->raid_cinfo[sparecol].ci_vp, - &clabel); + clabel = raidget_component_label(raidPtr, sparecol); /* make sure status is noted */ - raid_init_component_label(raidPtr, &clabel); + raid_init_component_label(raidPtr, clabel); + + clabel->column = scol; + clabel->status = rf_ds_optimal; + clabel->last_unit = raidPtr->raidid; - clabel.mod_counter = raidPtr->mod_counter; - clabel.column = scol; - clabel.status = rf_ds_optimal; - clabel.last_unit = raidPtr->raidid; - - raidwrite_component_label( - raidPtr->Disks[sparecol].dev, - raidPtr->raid_cinfo[sparecol].ci_vp, - &clabel); + raidflush_component_label(raidPtr, sparecol); if (final == RF_FINAL_COMPONENT_UPDATE) { if (raidPtr->parity_good == RF_RAID_CLEAN) { - raidmarkclean( raidPtr->Disks[sparecol].dev, - raidPtr->raid_cinfo[sparecol].ci_vp, - raidPtr->mod_counter); + raidmarkclean(raidPtr, sparecol); } } } @@ -3301,9 +3389,7 @@ rf_create_configuration(RF_AutoConfig_t int rf_set_autoconfig(RF_Raid_t *raidPtr, int new_value) { - RF_ComponentLabel_t clabel; - struct vnode *vp; - dev_t dev; + RF_ComponentLabel_t *clabel; int column; int sparecol; @@ -3311,21 +3397,17 @@ rf_set_autoconfig(RF_Raid_t *raidPtr, in for(column=0; columnnumCol; column++) { if (raidPtr->Disks[column].status == rf_ds_optimal) { - dev = raidPtr->Disks[column].dev; - vp = raidPtr->raid_cinfo[column].ci_vp; - raidread_component_label(dev, vp, &clabel); - clabel.autoconfigure = new_value; - raidwrite_component_label(dev, vp, &clabel); + clabel = raidget_component_label(raidPtr, column); + clabel->autoconfigure = new_value; + raidflush_component_label(raidPtr, column); } } for(column = 0; column < raidPtr->numSpare ; column++) { sparecol = raidPtr->numCol + column; if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { - dev = raidPtr->Disks[sparecol].dev; - vp = raidPtr->raid_cinfo[sparecol].ci_vp; - raidread_component_label(dev, vp, &clabel); - clabel.autoconfigure = new_value; 
- raidwrite_component_label(dev, vp, &clabel); + clabel = raidget_component_label(raidPtr, sparecol); + clabel->autoconfigure = new_value; + raidflush_component_label(raidPtr, sparecol); } } return(new_value); @@ -3334,30 +3416,24 @@ rf_set_autoconfig(RF_Raid_t *raidPtr, in int rf_set_rootpartition(RF_Raid_t *raidPtr, int new_value) { - RF_ComponentLabel_t clabel; - struct vnode *vp; - dev_t dev; + RF_ComponentLabel_t *clabel; int column; int sparecol; raidPtr->root_partition = new_value; for(column=0; columnnumCol; column++) { if (raidPtr->Disks[column].status == rf_ds_optimal) { - dev = raidPtr->Disks[column].dev; - vp = raidPtr->raid_cinfo[column].ci_vp; - raidread_component_label(dev, vp, &clabel); - clabel.root_partition = new_value; - raidwrite_component_label(dev, vp, &clabel); + clabel = raidget_component_label(raidPtr, column); + clabel->root_partition = new_value; + raidflush_component_label(raidPtr, column); } } for(column = 0; column < raidPtr->numSpare ; column++) { sparecol = raidPtr->numCol + column; if (raidPtr->Disks[sparecol].status == rf_ds_used_spare) { - dev = raidPtr->Disks[sparecol].dev; - vp = raidPtr->raid_cinfo[sparecol].ci_vp; - raidread_component_label(dev, vp, &clabel); - clabel.root_partition = new_value; - raidwrite_component_label(dev, vp, &clabel); + clabel = raidget_component_label(raidPtr, sparecol); + clabel->root_partition = new_value; + raidflush_component_label(raidPtr, sparecol); } } return(new_value); @@ -3410,6 +3486,7 @@ raid_init_component_label(RF_Raid_t *rai clabel->version = RF_COMPONENT_LABEL_VERSION; clabel->serial_number = raidPtr->serial_number; clabel->mod_counter = raidPtr->mod_counter; + clabel->num_rows = 1; clabel->num_columns = raidPtr->numCol; clabel->clean = RF_RAID_DIRTY; /* not clean */ @@ -3429,6 +3506,10 @@ raid_init_component_label(RF_Raid_t *rai clabel->root_partition = raidPtr->root_partition; clabel->last_unit = raidPtr->raidid; clabel->config_order = raidPtr->config_order; + +#ifndef 
RF_NO_PARITY_MAP + rf_paritymap_init_label(raidPtr->parity_map, clabel); +#endif } int @@ -3659,7 +3740,7 @@ rf_set_properties(struct raid_softc *rs, * that fails. */ -static int +int rf_sync_component_caches(RF_Raid_t *raidPtr) { int c, sparecol; Index: sys/dev/raidframe/rf_paritymap.c =================================================================== RCS file: sys/dev/raidframe/rf_paritymap.c diff -N sys/dev/raidframe/rf_paritymap.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/dev/raidframe/rf_paritymap.c 1 Nov 2009 21:29:17 -0000 @@ -0,0 +1,750 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2009 Jed Davis. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include
+__KERNEL_RCSID(0, "$NetBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Important parameters: */
+#define REGION_MINSIZE (25ULL << 20)
+#define DFL_TICKMS 40000
+#define DFL_COOLDOWN 8 /* 7-8 intervals of 40s = 5min +/- 20s */
+
+/* Internal-use flag bits. */
+#define TICKING 1
+#define TICKED 2
+
+/* Prototypes! */
+static void rf_paritymap_write_locked(struct rf_paritymap *);
+static void rf_paritymap_tick(void *);
+static u_int rf_paritymap_nreg(RF_Raid_t *);
+
+/*
+ * Extract the current status of the parity map.
+ *
+ * *ps is zeroed first.  With pm == NULL only ps->enabled (0) is
+ * meaningful; otherwise the parameters, the dirty bitmap and the
+ * counters are copied out while holding pm->lock.
+ */
+void
+rf_paritymap_status(struct rf_paritymap *pm, struct rf_pmstat *ps)
+{
+	memset(ps, 0, sizeof(*ps));
+	if (pm == NULL)
+		ps->enabled = 0;
+	else {
+		ps->enabled = 1;
+		ps->region_size = pm->region_size;
+		mutex_enter(&pm->lock);
+		memcpy(&ps->params, &pm->params, sizeof(ps->params));
+		/*
+		 * Copies from the start of *pm->disk_now; presumably the
+		 * bitmap is its first member -- verify against
+		 * rf_paritymap.h.
+		 */
+		memcpy(ps->dirty, pm->disk_now, sizeof(ps->dirty));
+		memcpy(&ps->ctrs, &pm->ctrs, sizeof(ps->ctrs));
+		mutex_exit(&pm->lock);
+	}
+}
+
+/*
+ * Test whether parity in a given sector is suspected of being inconsistent
+ * on disk (assuming that any pending I/O to it is allowed to complete).
+ * This may be of interest to future work on parity scrubbing.
+ *
+ * Returns 1 if the sector's region bit is set in pm->disk_boot, else 0.
+ */
+int
+rf_paritymap_test(struct rf_paritymap *pm, daddr_t sector)
+{
+	unsigned region = sector / pm->region_size;
+	int retval;
+
+	/* Same bound the begin/end_region paths assert before indexing. */
+	KASSERT(region < RF_PARITYMAP_NREG);
+
+	mutex_enter(&pm->lock);
+	retval = isset(pm->disk_boot->bits, region) ? 1 : 0;
+	mutex_exit(&pm->lock);
+	return retval;
+}
+
+/*
+ * To be called before a write to the RAID is submitted.
+ *
+ * Calls rf_paritymap_begin_region() once for every region from the one
+ * containing offset through the one containing offset + size - 1.
+ * Assumes size >= 1 -- TODO confirm against callers.
+ */
+void
+rf_paritymap_begin(struct rf_paritymap *pm, daddr_t offset, daddr_t size)
+{
+	unsigned i, b, e;
+
+	b = offset / pm->region_size;
+	e = (offset + size - 1) / pm->region_size;
+
+	for (i = b; i <= e; i++)
+		rf_paritymap_begin_region(pm, i);
+}
+
+/* To be called after a write to the RAID completes. 
*/ +void +rf_paritymap_end(struct rf_paritymap *pm, daddr_t offset, daddr_t size) +{ + unsigned i, b, e; + + b = offset / pm->region_size; + e = (offset + size - 1) / pm->region_size; + + for (i = b; i <= e; i++) + rf_paritymap_end_region(pm, i); +} + +void +rf_paritymap_begin_region(struct rf_paritymap *pm, unsigned region) +{ + int needs_write; + + KASSERT(region < RF_PARITYMAP_NREG); + pm->ctrs.nwrite++; + + /* If it was being kept warm, deal with that. */ + mutex_enter(&pm->lock); + if (pm->current->state[region] < 0) + pm->current->state[region] = 0; + + /* This shouldn't happen unless RAIDOUTSTANDING is set too high. */ + KASSERT(pm->current->state[region] < 127); + pm->current->state[region]++; + + needs_write = isclr(pm->disk_now->bits, region); + + if (needs_write) { + KASSERT(pm->current->state[region] == 1); + rf_paritymap_write_locked(pm); + } + + mutex_exit(&pm->lock); +} + +void +rf_paritymap_end_region(struct rf_paritymap *pm, unsigned region) +{ + KASSERT(region < RF_PARITYMAP_NREG); + + mutex_enter(&pm->lock); + KASSERT(pm->current->state[region] > 0); + --pm->current->state[region]; + + if (pm->current->state[region] <= 0) { + pm->current->state[region] = -pm->params.cooldown; + KASSERT(pm->current->state[region] <= 0); + mutex_enter(&pm->lk_flags); + if (!(pm->flags & TICKING)) { + pm->flags |= TICKING; + mutex_exit(&pm->lk_flags); + callout_schedule(&pm->ticker, + mstohz(pm->params.tickms)); + } else + mutex_exit(&pm->lk_flags); + } + mutex_exit(&pm->lock); +} + +/* + * Updates the parity map to account for any changes in current activity + * and/or an ongoing parity scan, then writes it to disk with appropriate + * synchronization. + */ +void +rf_paritymap_write(struct rf_paritymap *pm) +{ + mutex_enter(&pm->lock); + rf_paritymap_write_locked(pm); + mutex_exit(&pm->lock); +} + +/* As above, but to be used when pm->lock is already held. 
*/ +static void +rf_paritymap_write_locked(struct rf_paritymap *pm) +{ + char w, w0; + int i, j, setting, clearing; + + setting = clearing = 0; + for (i = 0; i < RF_PARITYMAP_NBYTE; i++) { + w0 = pm->disk_now->bits[i]; + w = pm->disk_boot->bits[i]; + + for (j = 0; j < NBBY; j++) + if (pm->current->state[i * NBBY + j] != 0) + w |= 1 << j; + + if (w & ~w0) + setting = 1; + if (w0 & ~w) + clearing = 1; + + pm->disk_now->bits[i] = w; + } + pm->ctrs.ncachesync += setting + clearing; + pm->ctrs.nclearing += clearing; + + /* + * If bits are being set in the parity map, then a sync is + * required afterwards, so that the regions are marked dirty + * on disk before any writes to them take place. If bits are + * being cleared, then a sync is required before the write, so + * that any writes to those regions are processed before the + * region is marked clean. (Synchronization is somewhat + * overkill; a write ordering barrier would suffice, but we + * currently have no way to express that directly.) + */ + if (clearing) + rf_sync_component_caches(pm->raid); + rf_paritymap_kern_write(pm->raid, pm->disk_now); + if (setting) + rf_sync_component_caches(pm->raid); +} + +/* Mark all parity as being in need of rewrite. */ +void +rf_paritymap_invalidate(struct rf_paritymap *pm) +{ + mutex_enter(&pm->lock); + memset(pm->disk_boot, ~(unsigned char)0, + sizeof(struct rf_paritymap_ondisk)); + mutex_exit(&pm->lock); +} + +/* Mark all parity as being correct. */ +void +rf_paritymap_forceclean(struct rf_paritymap *pm) +{ + mutex_enter(&pm->lock); + memset(pm->disk_boot, (unsigned char)0, + sizeof(struct rf_paritymap_ondisk)); + mutex_exit(&pm->lock); +} + +/* + * The cooldown callout routine just defers its work to a thread; it can't do + * the parity map write itself as it would block, and although mutex-induced + * blocking is permitted it seems wise to avoid tying up the softint. 
+ */ +static void +rf_paritymap_tick(void *arg) +{ + struct rf_paritymap *pm = arg; + + mutex_enter(&pm->lk_flags); + pm->flags |= TICKED; + mutex_exit(&pm->lk_flags); + wakeup(&(pm->raid->iodone)); /* XXX */ +} + +/* + * This is where the parity cooling work (and rearming the callout if needed) + * is done; the raidio thread calls it when woken up, as by the above. + */ +void +rf_paritymap_checkwork(struct rf_paritymap *pm) +{ + int i, zerop, progressp; + + mutex_enter(&pm->lk_flags); + if (pm->flags & TICKED) { + zerop = progressp = 0; + + pm->flags &= ~TICKED; + mutex_exit(&pm->lk_flags); + + mutex_enter(&pm->lock); + for (i = 0; i < RF_PARITYMAP_NREG; i++) { + if (pm->current->state[i] < 0) { + progressp = 1; + pm->current->state[i]++; + if (pm->current->state[i] == 0) + zerop = 1; + } + } + + if (progressp) + callout_schedule(&pm->ticker, + mstohz(pm->params.tickms)); + else { + mutex_enter(&pm->lk_flags); + pm->flags &= ~TICKING; + mutex_exit(&pm->lk_flags); + } + + if (zerop) + rf_paritymap_write_locked(pm); + mutex_exit(&pm->lock); + } else + mutex_exit(&pm->lk_flags); +} + +/* + * Set parity map parameters; used both to alter parameters on the fly and to + * establish their initial values. Note that setting a parameter to 0 means + * to leave the previous setting unchanged, and that if this is done for the + * initial setting of "regions", then a default value will be computed based + * on the RAID component size. + */ +int +rf_paritymap_set_params(struct rf_paritymap *pm, + const struct rf_pmparams *params, int todisk) +{ + int cooldown, tickms; + u_int regions; + RF_RowCol_t col; + RF_ComponentLabel_t *clabel; + RF_Raid_t *raidPtr; + + cooldown = params->cooldown != 0 + ? params->cooldown : pm->params.cooldown; + tickms = params->tickms != 0 + ? params->tickms : pm->params.tickms; + regions = params->regions != 0 + ? 
params->regions : pm->params.regions; + + if (cooldown < 1 || cooldown > 128) { + printf("raid%d: cooldown %d out of range\n", pm->raid->raidid, + cooldown); + return (-1); + } + if (tickms < 10) { + printf("raid%d: tick time %dms out of range\n", + pm->raid->raidid, tickms); + return (-1); + } + if (regions == 0) { + regions = rf_paritymap_nreg(pm->raid); + } else if (regions > RF_PARITYMAP_NREG) { + printf("raid%d: region count %u too large (more than %u)\n", + pm->raid->raidid, regions, RF_PARITYMAP_NREG); + return (-1); + } + + /* XXX any currently warm parity will be used with the new tickms! */ + pm->params.cooldown = cooldown; + pm->params.tickms = tickms; + /* Apply the initial region count, but do not change it after that. */ + if (pm->params.regions == 0) + pm->params.regions = regions; + + /* So that the newly set parameters can be tested: */ + pm->ctrs.nwrite = pm->ctrs.ncachesync = pm->ctrs.nclearing = 0; + + if (todisk) { + raidPtr = pm->raid; + for (col = 0; col < raidPtr->numCol; col++) { + clabel = raidget_component_label(raidPtr, col); + clabel->parity_map_ntick = cooldown; + clabel->parity_map_tickms = tickms; + clabel->parity_map_regions = regions; + raidflush_component_label(raidPtr, col); + } + } + return 0; +} + +/* + * The number of regions may not be as many as can fit into the map, because + * when regions are too small, the overhead of setting parity map bits + * becomes significant in comparison to the actual I/O, while the + * corresponding gains in parity verification time become negligible. Thus, + * a minimum region size (defined above) is imposed. + * + * Note that, if the number of regions is less than the maximum, then some of + * the regions will be "fictional", corresponding to no actual disk; some + * parts of the code may process them as normal, but they can not ever be + * written to. 
+ */ +static u_int +rf_paritymap_nreg(RF_Raid_t *raid) +{ + daddr_t bytes_per_disk, nreg; + + bytes_per_disk = raid->sectorsPerDisk << raid->logBytesPerSector; + nreg = bytes_per_disk / REGION_MINSIZE; + if (nreg > RF_PARITYMAP_NREG) + nreg = RF_PARITYMAP_NREG; + + return (u_int)nreg; +} + +/* + * Initialize a parity map given specific parameters. This neither reads nor + * writes the parity map config in the component labels; for that, see below. + */ +int +rf_paritymap_init(struct rf_paritymap *pm, RF_Raid_t *raid, + const struct rf_pmparams *params) +{ + daddr_t rstripes; + struct rf_pmparams safe; + + pm->raid = raid; + pm->params.regions = 0; + if (0 != rf_paritymap_set_params(pm, params, 0)) { + /* + * If the parameters are out-of-range, then bring the + * parity map up with something reasonable, so that + * the admin can at least go and fix it (or ignore it + * entirely). + */ + safe.cooldown = DFL_COOLDOWN; + safe.tickms = DFL_TICKMS; + safe.regions = 0; + + if (0 != rf_paritymap_set_params(pm, &safe, 0)) + return (-1); + } + + rstripes = howmany(raid->Layout.numStripe, pm->params.regions); + pm->region_size = rstripes * raid->Layout.dataSectorsPerStripe; + + callout_init(&pm->ticker, CALLOUT_MPSAFE); + callout_setfunc(&pm->ticker, rf_paritymap_tick, pm); + pm->flags = 0; + + pm->disk_boot = kmem_alloc(sizeof(struct rf_paritymap_ondisk), + KM_SLEEP); + pm->disk_now = kmem_alloc(sizeof(struct rf_paritymap_ondisk), + KM_SLEEP); + pm->current = kmem_zalloc(sizeof(struct rf_paritymap_current), + KM_SLEEP); + + rf_paritymap_kern_read(pm->raid, pm->disk_boot); + memcpy(pm->disk_now, pm->disk_boot, sizeof(*pm->disk_now)); + + mutex_init(&pm->lock, MUTEX_DEFAULT, IPL_NONE); + mutex_init(&pm->lk_flags, MUTEX_DEFAULT, IPL_SOFTCLOCK); + + return 0; +} + +/* + * Destroys a parity map; unless "force" is set, also cleans parity for any + * regions which were still in cooldown (but are not dirty on disk). 
+ */ +void +rf_paritymap_destroy(struct rf_paritymap *pm, int force) +{ + int i; + + callout_halt(&pm->ticker, NULL); /* XXX stop? halt? */ + callout_destroy(&pm->ticker); + + if (!force) { + for (i = 0; i < RF_PARITYMAP_NREG; i++) { + /* XXX check for > 0 ? */ + if (pm->current->state[i] < 0) + pm->current->state[i] = 0; + } + + rf_paritymap_write_locked(pm); + } + + mutex_destroy(&pm->lock); + mutex_destroy(&pm->lk_flags); + + kmem_free(pm->disk_boot, sizeof(struct rf_paritymap_ondisk)); + kmem_free(pm->disk_now, sizeof(struct rf_paritymap_ondisk)); + kmem_free(pm->current, sizeof(struct rf_paritymap_current)); +} + +/* + * Rewrite parity, taking parity map into account; this is the equivalent of + * the old rf_RewriteParity, and is likewise to be called from a suitable + * thread and shouldn't have multiple copies running in parallel and so on. + * + * Note that the fictional regions are "cleaned" in one shot, so that very + * small RAIDs (useful for testing) will not experience potentially severe + * regressions in rewrite time. + */ +int +rf_paritymap_rewrite(struct rf_paritymap *pm) +{ + int i, ret_val = 0; + daddr_t reg_b, reg_e; + + /* Process only the actual regions. */ + for (i = 0; i < pm->params.regions; i++) { + mutex_enter(&pm->lock); + if (isset(pm->disk_boot->bits, i)) { + mutex_exit(&pm->lock); + + reg_b = i * pm->region_size; + reg_e = reg_b + pm->region_size; + if (reg_e > pm->raid->totalSectors) + reg_e = pm->raid->totalSectors; + + if (rf_RewriteParityRange(pm->raid, reg_b, + reg_e - reg_b)) { + ret_val = 1; + if (pm->raid->waitShutdown) + return ret_val; + } else { + mutex_enter(&pm->lock); + clrbit(pm->disk_boot->bits, i); + rf_paritymap_write_locked(pm); + mutex_exit(&pm->lock); + } + } else { + mutex_exit(&pm->lock); + } + } + + /* Now, clear the fictional regions, if any. 
*/ + rf_paritymap_forceclean(pm); + rf_paritymap_write(pm); + + return ret_val; +} + +/* + * How to merge the on-disk parity maps when reading them in from the + * various components; returns whether they differ. In the case that + * they do differ, sets *dst to the union of *dst and *src. + * + * In theory, it should be safe to take the intersection (or just pick + * a single component arbitrarily), but the paranoid approach costs + * little. + * + * Appropriate locking, if any, is the responsibility of the caller. + */ +int +rf_paritymap_merge(struct rf_paritymap_ondisk *dst, + struct rf_paritymap_ondisk *src) +{ + int i, discrep = 0; + + for (i = 0; i < RF_PARITYMAP_NBYTE; i++) { + if (dst->bits[i] != src->bits[i]) + discrep = 1; + dst->bits[i] |= src->bits[i]; + } + + return discrep; +} + +/* + * Detach a parity map from its RAID. This is not meant to be applied except + * when unconfiguring the RAID after all I/O has been resolved, as otherwise + * an out-of-date parity map could be treated as current. + */ +void +rf_paritymap_detach(RF_Raid_t *raidPtr) +{ + if (raidPtr->parity_map == NULL) + return; + + simple_lock(&(raidPtr->iodone_lock)); + struct rf_paritymap *pm = raidPtr->parity_map; + raidPtr->parity_map = NULL; + simple_unlock(&(raidPtr->iodone_lock)); + /* XXXjld is that enough locking? Or too much? */ + rf_paritymap_destroy(pm, 0); + kmem_free(pm, sizeof(*pm)); +} + +/* + * Attach a parity map to a RAID set if appropriate. Includes + * configure-time processing of parity-map fields of component label. + */ +void +rf_paritymap_attach(RF_Raid_t *raidPtr, int force) +{ + RF_RowCol_t col; + int pm_use, pm_zap; + int g_tickms, g_ntick, g_regions; + int good; + RF_ComponentLabel_t *clabel; + u_int flags, regions; + struct rf_pmparams params; + + if (raidPtr->Layout.map->faultsTolerated == 0) { + /* There isn't any parity. 
*/ + return; + } + + pm_use = 1; + pm_zap = 0; + g_tickms = DFL_TICKMS; + g_ntick = DFL_COOLDOWN; + g_regions = 0; + + /* + * Collect opinions on the set config. If this is the initial + * config (raidctl -C), treat all labels as invalid, since + * there may be random data present. + */ + if (!force) { + for (col = 0; col < raidPtr->numCol; col++) { + clabel = raidget_component_label(raidPtr, col); + flags = clabel->parity_map_flags; + /* Check for use by non-parity-map kernel. */ + if (clabel->parity_map_modcount + != clabel->mod_counter) { + flags &= ~RF_PMLABEL_WASUSED; + } + + if (flags & RF_PMLABEL_VALID) { + g_tickms = clabel->parity_map_tickms; + g_ntick = clabel->parity_map_ntick; + regions = clabel->parity_map_regions; + if (g_regions == 0) + g_regions = regions; + else if (g_regions != regions) { + pm_zap = 1; /* important! */ + } + + if (flags & RF_PMLABEL_DISABLE) { + pm_use = 0; + } + if (!(flags & RF_PMLABEL_WASUSED)) { + pm_zap = 1; + } + } else { + pm_zap = 1; + } + } + } else { + pm_zap = 1; + } + + /* Finally, create and attach the parity map. */ + if (pm_use) { + params.cooldown = g_ntick; + params.tickms = g_tickms; + params.regions = g_regions; + + raidPtr->parity_map = kmem_alloc(sizeof(struct rf_paritymap), + KM_SLEEP); + if (0 != rf_paritymap_init(raidPtr->parity_map, raidPtr, + &params)) { + /* It failed; do without. */ + kmem_free(raidPtr->parity_map, + sizeof(struct rf_paritymap)); + raidPtr->parity_map = NULL; + return; + } + + if (g_regions == 0) + /* Pick up the autoconfigured region count. */ + g_regions = raidPtr->parity_map->params.regions; + + if (pm_zap) { + good = raidPtr->parity_good && !force; + + if (good) + rf_paritymap_forceclean(raidPtr->parity_map); + else + rf_paritymap_invalidate(raidPtr->parity_map); + /* This needs to be on disk before WASUSED is set. */ + rf_paritymap_write(raidPtr->parity_map); + } + } + + /* Alter labels in-core to reflect the current view of things. 
*/ + for (col = 0; col < raidPtr->numCol; col++) { + clabel = raidget_component_label(raidPtr, col); + + if (pm_use) + flags = RF_PMLABEL_VALID | RF_PMLABEL_WASUSED; + else + flags = RF_PMLABEL_VALID | RF_PMLABEL_DISABLE; + + clabel->parity_map_flags = flags; + clabel->parity_map_tickms = g_tickms; + clabel->parity_map_ntick = g_ntick; + clabel->parity_map_regions = g_regions; + raidflush_component_label(raidPtr, col); + } +} + +/* + * For initializing the parity-map fields of a component label, both on + * initial creation and on reconstruct/copyback/etc. + */ +void +rf_paritymap_init_label(struct rf_paritymap *pm, RF_ComponentLabel_t *clabel) +{ + if (pm != NULL) { + clabel->parity_map_flags = + RF_PMLABEL_VALID | RF_PMLABEL_WASUSED; + clabel->parity_map_tickms = pm->params.tickms; + clabel->parity_map_ntick = pm->params.cooldown; + /* + * XXXjld: If the number of regions is changed on disk, and + * then a new component is labeled before the next configure, + * then it will get the old value and they will conflict on + * the next boot (and the default will be used instead). + */ + clabel->parity_map_regions = pm->params.regions; + } else { + /* + * XXXjld: if the map is disabled, and all the components are + * replaced without an intervening unconfigure/reconfigure, + * then it will become enabled on the next unconfig/reconfig. + */ + } +} + + +/* Will the parity map be disabled next time? */ +int +rf_paritymap_get_disable(RF_Raid_t *raidPtr) +{ + RF_ComponentLabel_t *clabel; + RF_RowCol_t col; + int dis; + + dis = 0; + for (col = 0; col < raidPtr->numCol; col++) { + clabel = raidget_component_label(raidPtr, col); + if (clabel->parity_map_flags & RF_PMLABEL_DISABLE) + dis = 1; + } + + return dis; +} + +/* Set whether the parity map will be disabled next time. 
*/ +void +rf_paritymap_set_disable(RF_Raid_t *raidPtr, int dis) +{ + RF_ComponentLabel_t *clabel; + RF_RowCol_t col; + + for (col = 0; col < raidPtr->numCol; col++) { + clabel = raidget_component_label(raidPtr, col); + if (dis) + clabel->parity_map_flags |= RF_PMLABEL_DISABLE; + else + clabel->parity_map_flags &= ~RF_PMLABEL_DISABLE; + raidflush_component_label(raidPtr, col); + } +} Index: sys/dev/raidframe/rf_paritymap.h =================================================================== RCS file: sys/dev/raidframe/rf_paritymap.h diff -N sys/dev/raidframe/rf_paritymap.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/dev/raidframe/rf_paritymap.h 1 Nov 2009 21:29:17 -0000 @@ -0,0 +1,125 @@ +/* $NetBSD$ */ + +/*- + * Copyright (c) 2009 Jed Davis. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include + +#include + +/* RF_PARITYMAP_N* in raidframevar.h */ + +#define RF_PMLABEL_VALID 1 +#define RF_PMLABEL_WASUSED 2 +#define RF_PMLABEL_DISABLE 4 + +/* + * On-disk format: a single bit for each region; if the bit is clear, + * then the parity is clean. + */ +struct rf_paritymap_ondisk +{ + /* XXX Do these really need to be volatile? */ + volatile char bits[RF_PARITYMAP_NBYTE]; +}; + +/* In-core per-region state: a byte for each, encoded as follows. */ +struct rf_paritymap_current +{ + volatile int8_t state[RF_PARITYMAP_NREG]; + /* + * Values: + * if x == 0, the region may be written out as clean + * if x > 0, then x outstanding IOs to that region + * if x < 0, then there was recently IO; periodically increment x + */ +}; + +/* The entire state. */ +struct rf_paritymap +{ + struct rf_paritymap_ondisk *disk_boot, *disk_now; + struct rf_paritymap_current *current; + + /* + * This lock will be held while component disks' caches are + * flushed, which could take many milliseconds, so it should + * not be taken where that kind of delay is unacceptable. + * Contention on this lock is not, however, expected to be a + * performance bottleneck. + */ + kmutex_t lock; + /* + * The flags field, below, has its own lock so that + * inter-thread communication can occur without taking the + * overall lock. Ordering is lock -> lk_flags. 
+ */ + kmutex_t lk_flags; + + RF_Raid_t *raid; + daddr_t region_size; + callout_t ticker; + struct rf_pmparams params; + volatile int flags; + struct rf_pmctrs ctrs; +}; + +void rf_paritymap_status(struct rf_paritymap *, struct rf_pmstat *); + +int rf_paritymap_test(struct rf_paritymap *, daddr_t); +void rf_paritymap_begin_region(struct rf_paritymap *, unsigned); +void rf_paritymap_begin(struct rf_paritymap *, daddr_t, daddr_t); +void rf_paritymap_end_region(struct rf_paritymap *, unsigned); +void rf_paritymap_end(struct rf_paritymap *, daddr_t, daddr_t); + +void rf_paritymap_checkwork(struct rf_paritymap *); +void rf_paritymap_invalidate(struct rf_paritymap *); +void rf_paritymap_forceclean(struct rf_paritymap *); +void rf_paritymap_write(struct rf_paritymap *); + +int rf_paritymap_init(struct rf_paritymap *, RF_Raid_t *, + const struct rf_pmparams *); +void rf_paritymap_destroy(struct rf_paritymap *, int); + +int rf_paritymap_rewrite(struct rf_paritymap *); + +int rf_paritymap_merge(struct rf_paritymap_ondisk *, + struct rf_paritymap_ondisk *); + +void rf_paritymap_attach(RF_Raid_t *, int); +void rf_paritymap_detach(RF_Raid_t *); /* Not while the RAID is live! 
*/ + +int rf_paritymap_get_disable(RF_Raid_t *); +void rf_paritymap_set_disable(RF_Raid_t *, int); + +int rf_paritymap_set_params(struct rf_paritymap *, + const struct rf_pmparams *, int); + +void rf_paritymap_init_label(struct rf_paritymap *, + RF_ComponentLabel_t *); Index: sys/dev/raidframe/rf_parityscan.c =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_parityscan.c,v retrieving revision 1.32 diff -u -p -r1.32 rf_parityscan.c --- sys/dev/raidframe/rf_parityscan.c 16 Nov 2006 01:33:23 -0000 1.32 +++ sys/dev/raidframe/rf_parityscan.c 1 Nov 2009 21:29:17 -0000 @@ -46,6 +46,7 @@ __KERNEL_RCSID(0, "$NetBSD: rf_paritysca #include "rf_engine.h" #include "rf_parityscan.h" #include "rf_map.h" +#include "rf_paritymap.h" /***************************************************************************** * @@ -63,6 +64,20 @@ __KERNEL_RCSID(0, "$NetBSD: rf_paritysca int rf_RewriteParity(RF_Raid_t *raidPtr) { + if (raidPtr->parity_map != NULL) + return rf_paritymap_rewrite(raidPtr->parity_map); + else + return rf_RewriteParityRange(raidPtr, 0, raidPtr->totalSectors); +} + +int +rf_RewriteParityRange(RF_Raid_t *raidPtr, RF_SectorNum_t sec_begin, + RF_SectorNum_t sec_len) +{ + /* + * Note: It is the caller's responsibility to ensure that + * sec_begin and sec_len are stripe-aligned. 
+ */ RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; RF_AccessStripeMapHeader_t *asm_h; int ret_val; @@ -86,7 +101,7 @@ rf_RewriteParity(RF_Raid_t *raidPtr) rc = RF_PARITY_OKAY; - for (i = 0; i < raidPtr->totalSectors && + for (i = sec_begin; i < sec_begin + sec_len && rc <= RF_PARITY_CORRECTED; i += layoutPtr->dataSectorsPerStripe) { if (raidPtr->waitShutdown) { Index: sys/dev/raidframe/rf_parityscan.h =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_parityscan.h,v retrieving revision 1.7 diff -u -p -r1.7 rf_parityscan.h --- sys/dev/raidframe/rf_parityscan.h 11 Dec 2005 12:23:37 -0000 1.7 +++ sys/dev/raidframe/rf_parityscan.h 1 Nov 2009 21:29:17 -0000 @@ -34,6 +34,7 @@ #include "rf_alloclist.h" int rf_RewriteParity(RF_Raid_t *); +int rf_RewriteParityRange(RF_Raid_t *, RF_SectorNum_t, RF_SectorNum_t); int rf_VerifyParityBasic(RF_Raid_t *, RF_RaidAddr_t, RF_PhysDiskAddr_t *, int, RF_RaidAccessFlags_t); int rf_VerifyParity(RF_Raid_t *, RF_AccessStripeMap_t *, int, Index: sys/dev/raidframe/rf_raid.h =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_raid.h,v retrieving revision 1.37 diff -u -p -r1.37 rf_raid.h --- sys/dev/raidframe/rf_raid.h 16 Sep 2007 02:13:35 -0000 1.37 +++ sys/dev/raidframe/rf_raid.h 1 Nov 2009 21:29:17 -0000 @@ -137,7 +137,7 @@ struct RF_Raid_s { int parity_good; /* !0 if parity is known to be correct */ int serial_number; /* a "serial number" for this set */ int mod_counter; /* modification counter for component labels */ - int clean; /* the clean bit for this array. 
*/ + int clean; /* completely unused and should be removed */ int openings; /* Number of IO's which can be scheduled simultaneously (high-level - not a @@ -295,5 +295,6 @@ struct RF_Raid_s { RF_Thread_t pLogDiskThreadHandle; #endif /* RF_INCLUDE_PARITYLOGGING > 0 */ + struct rf_paritymap *parity_map; }; #endif /* !_RF__RF_RAID_H_ */ Index: sys/dev/raidframe/rf_reconstruct.c =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_reconstruct.c,v retrieving revision 1.105.4.2 diff -u -p -r1.105.4.2 rf_reconstruct.c --- sys/dev/raidframe/rf_reconstruct.c 19 Feb 2009 20:27:08 -0000 1.105.4.2 +++ sys/dev/raidframe/rf_reconstruct.c 1 Nov 2009 21:30:39 -0000 @@ -234,7 +234,7 @@ rf_ReconstructFailedDisk(RF_Raid_t *raid int rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t col) { - RF_ComponentLabel_t c_label; + RF_ComponentLabel_t *c_label; RF_RaidDisk_t *spareDiskPtr = NULL; RF_RaidReconDesc_t *reconDesc; RF_RowCol_t scol; @@ -289,17 +289,14 @@ rf_ReconstructFailedDiskBasic(RF_Raid_t if (!rc) { /* fix up the component label */ /* Don't actually need the read here.. 
*/ - raidread_component_label( - raidPtr->raid_cinfo[scol].ci_dev, - raidPtr->raid_cinfo[scol].ci_vp, - &c_label); - - raid_init_component_label( raidPtr, &c_label); - c_label.row = 0; - c_label.column = col; - c_label.clean = RF_RAID_DIRTY; - c_label.status = rf_ds_optimal; - c_label.partitionSize = raidPtr->Disks[scol].partitionSize; + c_label = raidget_component_label(raidPtr, scol); + + raid_init_component_label(raidPtr, c_label); + c_label->row = 0; + c_label->column = col; + c_label->clean = RF_RAID_DIRTY; + c_label->status = rf_ds_optimal; + c_label->partitionSize = raidPtr->Disks[scol].partitionSize; /* We've just done a rebuild based on all the other disks, so at this point the parity is known to be @@ -313,11 +310,7 @@ rf_ReconstructFailedDiskBasic(RF_Raid_t /* XXXX MORE NEEDED HERE */ - raidwrite_component_label( - raidPtr->raid_cinfo[scol].ci_dev, - raidPtr->raid_cinfo[scol].ci_vp, - &c_label); - + raidflush_component_label(raidPtr, scol); } else { /* Reconstruct failed. */ @@ -350,7 +343,7 @@ rf_ReconstructInPlace(RF_Raid_t *raidPtr RF_RaidDisk_t *spareDiskPtr = NULL; RF_RaidReconDesc_t *reconDesc; const RF_LayoutSW_t *lp; - RF_ComponentLabel_t c_label; + RF_ComponentLabel_t *c_label; int numDisksDone = 0, rc; struct partinfo dpart; struct vnode *vp; @@ -515,15 +508,13 @@ rf_ReconstructInPlace(RF_Raid_t *raidPtr /* fix up the component label */ /* Don't actually need the read here.. 
*/ - raidread_component_label(raidPtr->raid_cinfo[col].ci_dev, - raidPtr->raid_cinfo[col].ci_vp, - &c_label); + c_label = raidget_component_label(raidPtr, col); RF_LOCK_MUTEX(raidPtr->mutex); - raid_init_component_label(raidPtr, &c_label); + raid_init_component_label(raidPtr, c_label); - c_label.row = 0; - c_label.column = col; + c_label->row = 0; + c_label->column = col; /* We've just done a rebuild based on all the other disks, so at this point the parity is known to be @@ -534,10 +525,7 @@ rf_ReconstructInPlace(RF_Raid_t *raidPtr raidPtr->parity_good = RF_RAID_CLEAN; RF_UNLOCK_MUTEX(raidPtr->mutex); - raidwrite_component_label(raidPtr->raid_cinfo[col].ci_dev, - raidPtr->raid_cinfo[col].ci_vp, - &c_label); - + raidflush_component_label(raidPtr, col); } else { /* Reconstruct-in-place failed. Disk goes back to "failed" status, regardless of what it was before. */ Index: sys/dev/raidframe/rf_states.c =================================================================== RCS file: /bag/nb/repo/src/sys/dev/raidframe/rf_states.c,v retrieving revision 1.43 diff -u -p -r1.43 rf_states.c --- sys/dev/raidframe/rf_states.c 20 May 2008 00:29:54 -0000 1.43 +++ sys/dev/raidframe/rf_states.c 1 Nov 2009 21:29:17 -0000 @@ -45,6 +45,7 @@ __KERNEL_RCSID(0, "$NetBSD: rf_states.c, #include "rf_map.h" #include "rf_etimer.h" #include "rf_kintf.h" +#include "rf_paritymap.h" #ifndef RF_DEBUG_STATES #define RF_DEBUG_STATES 0 @@ -237,6 +238,15 @@ rf_State_LastState(RF_RaidAccessDesc_t * wakeup(&(desc->raidPtr->iodone)); + /* + * The parity_map hook has to go here, because the iodone + * callback goes straight into the kintf layer. 
+ */ + if (desc->raidPtr->parity_map != NULL && + desc->type == RF_IO_TYPE_WRITE) + rf_paritymap_end(desc->raidPtr->parity_map, + desc->raidAddress, desc->numBlocks); + /* printf("Calling biodone on 0x%x\n",desc->bp); */ biodone(desc->bp); /* access came through ioctl */ Index: sbin/raidctl/raidctl.8 =================================================================== RCS file: /bag/nb/repo/src/sbin/raidctl/raidctl.8,v retrieving revision 1.56 diff -u -p -r1.56 raidctl.8 --- sbin/raidctl/raidctl.8 28 Aug 2008 21:24:30 -0000 1.56 +++ sbin/raidctl/raidctl.8 1 Nov 2009 21:29:17 -0000 @@ -96,6 +96,16 @@ .Fl I Ar serial_number Ar dev .Nm .Op Fl v +.Fl m Ar dev +.Nm +.Op Fl v +.Fl M +.Oo yes | no | set +.Ar params +.Oc +.Ar dev +.Nm +.Op Fl v .Fl p Ar dev .Nm .Op Fl v @@ -222,6 +232,44 @@ different RAID sets. This step .Em MUST be performed when a new RAID set is created. +.It Fl m Ar dev +Display status information about the parity map on the RAID set, if any. +If used with +.Fl v +then the current contents of the parity map will be output (in +hexadecimal format) as well. +.It Fl M Ic yes Ar dev +.\"XXX should there be a section with more info on the parity map feature? +Enable the use of a parity map on the RAID set; this is the default, +and greatly reduces the time taken to check parity after unclean +shutdowns at the cost of some very slight overhead during normal +operation. +Changes to this setting will take effect the next time the set is +configured. +Note that RAID-0 sets, having no parity, will not use a parity map in +any case. +.It Fl M Ic no Ar dev +Disable the use of a parity map on the RAID set; doing this is not +recommended. +This will take effect the next time the set is configured. +.It Fl M Ic set Ar cooldown Ar tickms Ar regions Ar dev +Alter the parameters of the parity map; parameters to leave unchanged +can be given as 0, and trailing zeroes may be omitted. +.\"XXX should this explanation be deferred to another section as well? 
+The RAID set is divided into +.Ar regions +regions; each region is marked dirty for at most +.Ar cooldown +intervals of +.Ar tickms +milliseconds each after a write to it, and at least +.Ar cooldown +\- 1 such intervals. +Changes to +.Ar regions +take effect the next time the set is configured, while changes to the other +parameters are applied immediately. +The default parameters are expected to be reasonable for most workloads. .It Fl p Ar dev Check the status of the parity on the RAID set. Displays a status message, Index: sbin/raidctl/raidctl.c =================================================================== RCS file: /bag/nb/repo/src/sbin/raidctl/raidctl.c,v retrieving revision 1.39.4.1 diff -u -p -r1.39.4.1 raidctl.c --- sbin/raidctl/raidctl.c 1 Feb 2009 23:41:37 -0000 1.39.4.1 +++ sbin/raidctl/raidctl.c 1 Nov 2009 21:30:25 -0000 @@ -82,13 +82,15 @@ static void check_parity(int,int, char static void do_meter(int, u_long); static void get_bar(char *, double, int); static void get_time_string(char *, int); +static void rf_output_pmstat(int, int); +static void rf_pm_configure(int, int, char *, int[]); int verbose; int main(int argc,char *argv[]) { - int ch; + int ch, i; int num_options; unsigned long action; char config_filename[PATH_MAX]; @@ -96,6 +98,8 @@ main(int argc,char *argv[]) char name[PATH_MAX]; char component[PATH_MAX]; char autoconf[10]; + char *parityconf = NULL; + int parityparams[3]; int do_output; int do_recon; int do_rewrite; @@ -117,7 +121,7 @@ main(int argc,char *argv[]) force = 0; openmode = O_RDWR; /* default to read/write */ - while ((ch = getopt(argc, argv, "a:A:Bc:C:f:F:g:GiI:l:r:R:sSpPuv")) + while ((ch = getopt(argc, argv, "a:A:Bc:C:f:F:g:GiI:l:mM:r:R:sSpPuv")) != -1) switch(ch) { case 'a': @@ -181,6 +185,23 @@ main(int argc,char *argv[]) serial_number = atoi(optarg); num_options++; break; + case 'm': + action = RAIDFRAME_PARITYMAP_STATUS; + openmode = O_RDONLY; + num_options++; + break; + case 'M': + action = 
RAIDFRAME_PARITYMAP_SET_DISABLE; + parityconf = strdup(optarg); + num_options++; + /* XXXjld: should rf_pm_configure do the atoi()s? */ + i = 0; + while (i < 3 && optind < argc && + isdigit((int)argv[optind][0])) + parityparams[i++] = atoi(argv[optind++]); + while (i < 3) + parityparams[i++] = 0; + break; case 'l': action = RAIDFRAME_SET_COMPONENT_LABEL; strlcpy(component, optarg, sizeof(component)); @@ -308,6 +329,12 @@ main(int argc,char *argv[]) else rf_get_device_status(fd); break; + case RAIDFRAME_PARITYMAP_STATUS: + rf_output_pmstat(fd, raidID); + break; + case RAIDFRAME_PARITYMAP_SET_DISABLE: + rf_pm_configure(fd, raidID, parityconf, parityparams); + break; case RAIDFRAME_REBUILD_IN_PLACE: rebuild_in_place(fd, component); break; @@ -455,6 +482,105 @@ rf_get_device_status(int fd) } static void +rf_output_pmstat(int fd, int raidID) +{ + char srs[7]; + int i, j, dr; + int dis; + struct rf_pmstat st; + + do_ioctl(fd, RAIDFRAME_PARITYMAP_STATUS, &st, + "RAIDFRAME_PARITYMAP_STATUS"); + if (st.enabled) { + if (0 > humanize_number(srs, 7, st.region_size * DEV_BSIZE, + "B", HN_AUTOSCALE, HN_NOSPACE)) + strlcpy(srs, "???", 7); + + printf("raid%d: parity map enabled with %u regions of %s\n", + raidID, st.params.regions, srs); + printf("raid%d: parity cleaned after %d intervals of" + " %d.%03ds\n", raidID, st.params.cooldown, + st.params.tickms / 1000, st.params.tickms % 1000); + printf("raid%d: write/sync/clean counters " + "%"PRIu64"/%"PRIu64"/%"PRIu64"\n", raidID, + st.ctrs.nwrite, st.ctrs.ncachesync, st.ctrs.nclearing); + + dr = 0; + for (i = 0; i < RF_PARITYMAP_NREG; i++) + if (isset(st.dirty, i)) + dr++; + printf("raid%d: %d dirty region%s\n", raidID, dr, + dr == 1 ? 
"" : "s"); + + if (verbose > 0) { + for (i = 0; i < RF_PARITYMAP_NBYTE; i += 32) { + printf(" "); + for (j = i; j < RF_PARITYMAP_NBYTE + && j < i + 32; j++) + printf("%x%x", st.dirty[j] & 15, + (st.dirty[j] >> 4) & 15); + printf("\n"); + } + } + } else { + printf("raid%d: parity map disabled\n", raidID); + } + + do_ioctl(fd, RAIDFRAME_PARITYMAP_GET_DISABLE, &dis, + "RAIDFRAME_PARITYMAP_GET_DISABLE"); + printf("raid%d: parity map will %s %sabled on next configure\n", + raidID, dis == st.enabled ? "be" : "remain", dis ? "dis" : "en"); +} + +static void +rf_pm_configure(int fd, int raidID, char *parityconf, int parityparams[]) +{ + int dis; + struct rf_pmparams params; + + if (strcasecmp(parityconf, "yes") == 0) + dis = 0; + else if (strcasecmp(parityconf, "no") == 0) + dis = 1; + else if (strcasecmp(parityconf, "set") == 0) { + params.cooldown = parityparams[0]; + params.tickms = parityparams[1]; + params.regions = parityparams[2]; + + do_ioctl(fd, RAIDFRAME_PARITYMAP_SET_PARAMS, &params, + "RAIDFRAME_PARITYMAP_SET_PARAMS"); + + if (params.cooldown != 0 || params.tickms != 0) { + printf("raid%d: parity cleaned after", raidID); + if (params.cooldown != 0) + printf(" %d", params.cooldown); + printf(" intervals"); + if (params.tickms != 0) { + printf(" of %d.%03ds", params.tickms / 1000, + params.tickms % 1000); + } + printf("\n"); + } + if (params.regions != 0) + printf("raid%d: will use %d regions on next" + " configuration\n", raidID, params.regions); + + return; + /* XXX the control flow here could be prettier. */ + } else { + fprintf(stderr, "%s: \"%s\" is not a valid parity map command" + "\n", getprogname(), parityconf); + exit(1); + } + + do_ioctl(fd, RAIDFRAME_PARITYMAP_SET_DISABLE, &dis, + "RAIDFRAME_PARITYMAP_SET_DISABLE"); + printf("raid%d: parity map will be %sabled on next configure\n", + raidID, dis ? 
"dis" : "en"); +} + + +static void rf_output_configuration(int fd, const char *name) { RF_DeviceConfig_t device_config; @@ -1022,7 +1148,7 @@ usage(void) const char *progname = getprogname(); fprintf(stderr, "usage: %s [-v] -a component dev\n", progname); - fprintf(stderr, " %s [-v] -A yes | no | root dev\n", progname); + fprintf(stderr, " %s [-v] -A [yes | no | root] dev\n", progname); fprintf(stderr, " %s [-v] -B dev\n", progname); fprintf(stderr, " %s [-v] -c config_file dev\n", progname); fprintf(stderr, " %s [-v] -C config_file dev\n", progname); @@ -1032,6 +1158,9 @@ usage(void) fprintf(stderr, " %s [-v] -G dev\n", progname); fprintf(stderr, " %s [-v] -i dev\n", progname); fprintf(stderr, " %s [-v] -I serial_number dev\n", progname); + fprintf(stderr, " %s [-v] -m dev\n", progname); + fprintf(stderr, " %s [-v] -M [yes | no | set params] dev\n", + progname); fprintf(stderr, " %s [-v] -p dev\n", progname); fprintf(stderr, " %s [-v] -P dev\n", progname); fprintf(stderr, " %s [-v] -r component dev\n", progname);