Index: external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c
===================================================================
RCS file: /cvsroot/src/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c,v
retrieving revision 1.20
diff -d -p -u -r1.20 vdev_disk.c
--- external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c	16 Apr 2022 07:56:45 -0000	1.20
+++ external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c	1 Jun 2026 11:19:16 -0000
@@ -27,6 +27,7 @@
 
 #include
 #include
+#include
 #include
 #include
 #include
@@ -35,6 +36,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -98,48 +100,641 @@ vdev_disk_free(vdev_t *vd)
 }
 
 
-/*
- * It's not clear what these hold/rele functions are supposed to do.
- */
 static void
-vdev_disk_hold(vdev_t *vd)
+vdev_disk_flush(struct work *work, void *cookie)
 {
+	vdev_disk_t *dvd;
+	int error, cmd;
+	buf_t *bp;
+	vnode_t *vp;
 
-	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+	bp = (struct buf *)work;
+	vp = bp->b_vp;
+	dvd = cookie;
+
+	KASSERT(vp == dvd->vd_vp);
+
+	cmd = 1;
+	error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FWRITE, kcred);
+	bp->b_error = error;
+	vdev_disk_io_intr(bp);
+}
+
+/* Only disk-class dk, ld, sd, wd and xbd devices are probed for labels. */
+static boolean_t
+device_is_eligible_for_vdev(device_t device)
+{
+	if (device_class(device) != DV_DISK)
+		return B_FALSE;
+
+	/* XXX better check? */
+	/* XXX raidframe excludes devices instead of including devices */
+	if (device_is_a(device, "dk") ||
+	    device_is_a(device, "ld") ||
+	    device_is_a(device, "sd") ||
+	    device_is_a(device, "wd") ||
+	    device_is_a(device, "xbd")) {
+		return B_TRUE;
+	}
+	return B_FALSE;
+}
+
+/* Open the vnode named by vd->vdev_path for reading and writing. */
+static int
+vdev_disk_attach_path(vdev_t *vd, vnode_t **vpp)
+{
+
+	ZFS_LOG(2, "path = \"%s\"", vd->vdev_path);
+	return vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0, vpp,
+	    CRCREAT, 0);
+}
+
+/* Get a vnode for dev with bdevvp() and open it; returned unlocked. */
+static int
+vdev_disk_attach_dev(dev_t dev, vnode_t **vpp)
+{
+	int error;
+
+	ZFS_LOG(2, "dev = %#"PRIx64, dev);
+
+	error = bdevvp(dev, vpp);
+	if (error)
+		return error;
+
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+	ZFS_LOG(2, "call VOP_OPEN(vp %p)", *vpp);
+	error = VOP_OPEN(*vpp, FREAD|FWRITE, NOCRED);
+
+	if (error) {
+		ZFS_LOG(2, "VOP_OPEN returns error %d", error);
+		vput(*vpp);
+		return error;
+	}
+
+	VOP_UNLOCK(*vpp, 0);
+	return 0;
 
 }
 
+/* Close a vnode that was opened for reading and writing. */
 static void
-vdev_disk_rele(vdev_t *vd)
+vdev_disk_detach(vnode_t *vp)
 {
 
-	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+	ZFS_LOG(2, "call vn_close(vp %p)", vp);
+	vn_close(vp, FREAD|FWRITE, kcred);
+}
 
+/* XXX mostly from rf_netbsdkintf.c:raidread_component_area() */
+static int
+vdev_disk_read_blocks(struct vnode *b_vp, void *data, daddr_t offset, daddr_t size)
+{
+	size_t resid;
+	int error;
+
+	ZFS_LOG(4, "vnode %p, data %p, offset %"PRIu64", size %"PRIu64,
+	    b_vp, data, offset, size);
+	error = vn_rdwr(UIO_READ, b_vp, data, size, offset, UIO_SYSSPACE,
+	    0, RLIM64_INFINITY, kcred, &resid);
+
+	if (error)
+		return error;
+	if (resid != 0)
+		return EIO;
+
+	return 0;
 }
 
+/*
+ * XXX Modified from vdev_geom.c:vdev_geom_read_config()
+ * Mostly identical to vdev_geom_read_config except for block reading
+ * code, should be refactored to share common code.  We can add a
+ * NetBSD stub for issuing IO similar to vdev_geom_io() to reduce
+ * differences even more.
+ */
+
+/*
+ * Read the vdev config from a device.  Return the number of valid labels that
+ * were found.  The vdev config will be returned in config if and only if at
+ * least one valid label was found.
+ */
+static int
+vdev_disk_read_config(struct vnode *vp, const char *cname, nvlist_t **configp)
+{
+	vdev_phys_t *vdev_lists[VDEV_LABELS];
+	off_t offsets[VDEV_LABELS];
+	int errors[VDEV_LABELS];
+
+	nvlist_t *config;
+	void *buf;
+	size_t buflen;
+	off_t size, psize;
+	uint64_t state = 0, txg = 0, numsecs;
+	unsigned int secsize;
+	int error, l, nlabels;
+
+	ZFS_LOG(2, "called");
+	/* Failures below report "no labels"; never leave *configp unset. */
+	*configp = NULL;
+	error = getdisksize(vp, &numsecs, &secsize);
+	if (error)
+		return (0);
+	ZFS_LOG(2, "numsecs = %"PRIu64, numsecs);
+	ZFS_LOG(2, "secsize = %u", secsize);
+	if (numsecs == 0 || secsize == 0 || UINT64_MAX / numsecs < secsize)
+		return (0);
+	ZFS_LOG(2, "MAXBSIZE = %d", MAXBSIZE);
+	ZFS_LOG(1, "Reading config from %s...", cname);
+
+	psize = numsecs * secsize;
+	ZFS_LOG(3, "full partition size = %"PRIu64, psize);
+	psize = P2ALIGN_TYPED(psize, sizeof(vdev_label_t), uint64_t);
+	ZFS_LOG(3, "rounded partition size = %"PRIu64, psize);
+
+	size = sizeof(*vdev_lists[0]) + secsize -
+	    ((sizeof(*vdev_lists[0]) - 1) % secsize) - 1;
+
+	buflen = sizeof(vdev_lists[0]->vp_nvlist);
+
+	/* Read all the labels */
+	for (l = 0; l < VDEV_LABELS; l++) {
+		vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+		offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+		ZFS_LOG(3, "label %d offset = %"PRIu64, l, offsets[l]);
+		errors[l] = vdev_disk_read_blocks(vp, vdev_lists[l], offsets[l], size);
+	}
+
+	/* Parse the labels */
+	config = *configp = NULL;
+	nlabels = 0;
+	for (l = 0; l < VDEV_LABELS; l++) {
+		if (errors[l] != 0)
+			continue;
+
+		buf = vdev_lists[l]->vp_nvlist;
+
+		if (nvlist_unpack(buf, buflen, &config, 0) != 0) {
+			ZFS_LOG(3, "nvlist unpack error");
+			continue;
+		}
+
+		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+		    &state) != 0 || state > POOL_STATE_L2CACHE) {
+			ZFS_LOG(3, "found state %"PRIu64" BAD", state);
+			nvlist_free(config);
+			continue;
+		}
+		ZFS_LOG(3, "found state %"PRIu64, state);
+
+		if (state != POOL_STATE_SPARE &&
+		    state != POOL_STATE_L2CACHE &&
+		    (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+		    &txg) != 0 || txg == 0)) {
+			nvlist_free(config);
+			continue;
+		}
+		ZFS_LOG(3, "found txg = %"PRIu64, txg);
+
+		if (*configp != NULL)
+			nvlist_free(*configp);
+		*configp = config;
+		nlabels++;
+	}
+
+	/* Free the label storage */
+	for (l = 0; l < VDEV_LABELS; l++)
+		kmem_free(vdev_lists[l], size);
+
+	ZFS_LOG(3, "return %d", nlabels);
+	return (nlabels);
+}
+
+/* XXX direct from vdev_geom.c */
 static void
-vdev_disk_flush(struct work *work, void *cookie)
+resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
+{
+	nvlist_t **new_configs;
+	uint64_t i;
+
+	if (id < *count)
+		return;
+	new_configs = kmem_zalloc((id + 1) * sizeof(nvlist_t *),
+	    KM_SLEEP);
+	for (i = 0; i < *count; i++)
+		new_configs[i] = (*configs)[i];
+	if (*configs != NULL)
+		kmem_free(*configs, *count * sizeof(void *));
+	*configs = new_configs;
+	*count = id + 1;
+}
+
+/* XXX direct from vdev_geom.c but with lots of ZFS_LOG() */
+/* Consumes cfg: it is either stored in (*configs)[id] or freed. */
+static void
+process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
+    const char *name, uint64_t* known_pool_guid)
+{
+	nvlist_t *vdev_tree;
+	uint64_t pool_guid;
+	uint64_t vdev_guid;
+	uint64_t id, txg, known_txg;
+	char *pname = NULL;
+
+	/* XXXSB investigate a problem with a not-used mirror */
+
+	if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
+	    strcmp(pname, name) != 0) {
+		ZFS_LOG(3, "name mismatch \"%s\"",
+		    pname != NULL ? pname : "(no name)");
+		goto ignore;
+	}
+	ZFS_LOG(3, "found pool name \"%s\"", pname);
+
+	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+		goto ignore;
+	ZFS_LOG(3, "found pool guid %"PRIu64, pool_guid);
+
+	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
+		goto ignore;
+	ZFS_LOG(3, "found vdev guid %"PRIu64, vdev_guid);
+
+	if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
+		goto ignore;
+
+	if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
+		goto ignore;
+	ZFS_LOG(3, "found pool config id %"PRIu64, id);
+
+	VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+	ZFS_LOG(3, "found txg %"PRIu64,txg);
+
+	if (*known_pool_guid != 0) {
+		if (pool_guid != *known_pool_guid)
+			goto ignore;
+		ZFS_LOG(3, "found known pool guid");
+	} else
+		*known_pool_guid = pool_guid;
+
+	resize_configs(configs, count, id);
+
+	if ((*configs)[id] != NULL) {
+		VERIFY(nvlist_lookup_uint64((*configs)[id],
+		    ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
+		if (txg <= known_txg)
+			goto ignore;
+		nvlist_free((*configs)[id]);
+	}
+
+	/* XXXSB do we leak these nvlist configs? */
+	(*configs)[id] = cfg;
+	ZFS_LOG(3, "found");
+	return;
+
+ignore:
+	ZFS_LOG(3, "not found");
+	nvlist_free(cfg);
+}
+
+/*
+ * Scan every eligible disk device for labels of pool "poolname" and
+ * keep, per vdev-tree id, the config with the highest txg in *configs.
+ */
+int
+vdev_disk_read_pool_label(const char *poolname, nvlist_t ***configs,
+    uint64_t *count)
+{
+	struct vnode *vp;
+	device_t dv;
+	deviter_t di;
+	dev_t dev;
+	nvlist_t *vdev_cfg;
+	int error, nlabels, bmajor, bminor;
+	uint64_t pool_guid;
+
+	ZFS_LOG(3, "poolname = \"%s\"", poolname);
+
+	pool_guid = 0;
+	/*
+	 * below from
+	 * sys/dev/raidframe/rf_netbsdkintf.c:rf_find_raid_components()
+	 */
+
+	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
+	    dv = deviter_next(&di)) {
+		if (!device_is_eligible_for_vdev(dv))
+			continue;
+
+		ZFS_LOG(2, "found device \"%s\"", device_xname(dv));
+
+		/* need to find the device_name_to_block_device_major stuff */
+		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
+		bminor = minor(device_unit(dv));
+		if (device_has_partitions(dv)) {
+			dev = MAKEDISKDEV(bmajor, bminor, RAW_PART);
+		} else {
+			dev = makedev(bmajor, bminor);
+		}
+
+		ZFS_LOG(2, "dev = %s, major = %d, minor = %d, dev = %#"PRIx64,
+		    device_xname(dv), bmajor, bminor, dev);
+
+		/* get a vnode for the raw partition of this disk */
+		error = vdev_disk_attach_dev(dev, &vp);
+		if (error) {
+			/* Continue looking for something that exists */
+			ZFS_LOG(2, "vdev_disk_attach_dev(\"%s\") "
+			    "returns error %d", device_xname(dv), error);
+			continue;
+		}
+
+		/* if a wedge, validate wedge type is zfs */
+		/* XXX wedge specific knowledge! */
+		if (device_is_a(dv, "dk")) {
+			struct dkwedge_info dkw;
+			error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD,
+			    NOCRED);
+			if (error) {
+				printf("zfs: can't get wedge info for "
+				    "dev %s (%d)\n", device_xname(dv), error);
+				/* vp is unlocked here, as on every other close path */
+				vdev_disk_detach(vp);
+				continue;
+			}
+
+			ZFS_LOG(2, "wedge type \"%s\"", dkw.dkw_ptype);
+			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_ZFS) != 0) {
+				vdev_disk_detach(vp);
+				continue;
+			}
+		}
+
+		nlabels = vdev_disk_read_config(vp, device_xname(dv),
+		    &vdev_cfg);
+
+		if (nlabels > 0) {
+			ZFS_LOG(1, "successfully read vdev config");
+
+			process_vdev_config(configs, count, vdev_cfg,
+			    poolname, &pool_guid);
+		}
+
+		/* don't need this any more.  We'll allocate it again
+		   a little later if we really do... */
+		vdev_disk_detach(vp);
+	}
+	deviter_release(&di);
+
+	if (*count == 0) {
+		ZFS_LOG(2, "not found, return ENOENT");
+		return (ENOENT);
+	}
+	return (0);
+}
+
+enum match {
+	NO_MATCH = 0,		/* No matching labels found */
+	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid*/
+	ZERO_MATCH = 1,		/* Should never be returned */
+	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
+	TWO_MATCH = 3,		/* 2 label matching the vdev_guid */
+	THREE_MATCH = 4,	/* 3 label matching the vdev_guid */
+	FULL_MATCH = 5		/* all labels match the vdev_guid */
+
+};
+
+/* Grade how well the labels on vp match vd's pool and vdev guids. */
+static enum match
+vdev_attach_ok(vdev_t *vd, vnode_t *vp)
+{
+	nvlist_t *config;
+	uint64_t pool_guid, top_guid, vdev_guid;
+	int nlabels;
+
+	ZFS_LOG(2, "vd = %p, vp = %p, dev = %#"PRIx64, vd, vp, vp->v_rdev);
+	ZFS_LOG(2, "vdev_path =\"%s\"", vd->vdev_path);
+	ZFS_LOG(2, "vdev_devid =\"%s\"", vd->vdev_devid);
+	nlabels = vdev_disk_read_config(vp, vd->vdev_path, &config);
+	if (nlabels == 0) {
+		ZFS_LOG(1, "Unable to read config from %s.", vd->vdev_path);
+		return (NO_MATCH);
+	}
+
+	pool_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+	top_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+	vdev_guid = 0;
+	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+	nvlist_free(config);
+
+	/*
+	 * Check that the label's pool guid matches the desired guid.
+	 * Inactive spares and L2ARCs do not have any pool guid in the label.
+	 */
+	if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+		ZFS_LOG(1, "pool guid mismatch for device %s: %ju != %ju.",
+		    vd->vdev_devid, (uintmax_t)spa_guid(vd->vdev_spa),
+		    (uintmax_t)pool_guid);
+		return (NO_MATCH);
+	}
+
+	/*
+	 * Check that the label's vdev guid matches the desired guid.
+	 * The second condition handles possible race on vdev detach, when
+	 * remaining vdev receives GUID of destroyed top level mirror vdev.
+	 */
+	if (vdev_guid == vd->vdev_guid) {
+		ZFS_LOG(1, "guids match for device %s.", vd->vdev_devid);
+		return (ZERO_MATCH + nlabels);
+	} else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+		ZFS_LOG(1, "top vdev guid match for device %s.", vd->vdev_devid);
+		return (TOPGUID_MATCH);
+	}
+	ZFS_LOG(1, "vdev guid mismatch for device %s: %ju != %ju.",
+	    vd->vdev_devid, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+	return (NO_MATCH);
+}
+
+/* Open vd->vdev_path just long enough to grade its labels. */
+static enum match
+vdev_attach_ok_path(vdev_t *vd)
 {
-	vdev_disk_t *dvd;
-	int error, cmd;
-	buf_t *bp;
 	vnode_t *vp;
+	enum match m;
+	int error;
 
-	bp = (struct buf *)work;
-	vp = bp->b_vp;
-	dvd = cookie;
+	ZFS_LOG(2, "path = \"%s\"", vd->vdev_path);
+	error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0, &vp,
+	    CRCREAT, 0);
 
-	KASSERT(vp == dvd->vd_vp);
+	if (error) {
+		ZFS_LOG(2, "vn_open returns error %d", error);
+		return NO_MATCH;
+	}
 
-	cmd = 1;
-	error = VOP_IOCTL(vp, DIOCCACHESYNC, &cmd, FREAD|FWRITE, kcred);
-	bp->b_error = error;
-	vdev_disk_io_intr(bp);
+	m = vdev_attach_ok(vd, vp);
+
+	vn_close(vp, FREAD|FWRITE, kcred);
+
+	return m;
+}
+
+
+/* XXXSB so much code duplication with vdev_disk_read_pool_label() below */
+static int
+vdev_disk_attach_by_guids(vdev_t *vd, vnode_t **vpp)
+{
+	device_t dv, best_dv;
+	deviter_t di;
+	dev_t dev, best_dev;
+	int error, bmajor, bminor;
+	enum match match, best_match;
+
+	ZFS_LOG(2, "called");
+	*vpp = NULL;
+	best_dev = NODEV;
+	best_match = NO_MATCH;
+	best_dv = NULL;
+	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
+	    dv = deviter_next(&di)) {
+		if (!device_is_eligible_for_vdev(dv))
+			continue;
+
+		ZFS_LOG(2, "found device \"%s\"", device_xname(dv));
+
+		/* (ab)use vdev_devid for keeping track of this device name */
+		vd->vdev_devid = __UNCONST(device_xname(dv));
+
+		/* need to find the device_name_to_block_device_major stuff */
+		bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
+		bminor = minor(device_unit(dv));
+		if (device_has_partitions(dv)) {
+			dev = MAKEDISKDEV(bmajor, bminor, RAW_PART);
+		} else {
+			dev = makedev(bmajor, bminor);
+		}
+
+		/* XXXSB fix logic to follow vdev_geom_attach_by_guids? */
+		/* XXXSB keep open vp for last best match? */
+
+		ZFS_LOG(2, "dev = %s, major = %d, minor = %d, dev = %#"PRIx64,
+		    device_xname(dv), bmajor, bminor, dev);
+
+		/* get a vnode for the raw partition of this disk */
+		if (vdev_disk_attach_dev(dev, vpp) != 0) {
+			/* "Who cares."  Continue looking for something that exists */
+			continue;
+		}
+
+		/* if a wedge, validate wedge type is zfs */
+		/* XXX wedge specific knowledge! */
+		if (device_is_a(dv, "dk")) {
+			struct dkwedge_info dkw;
+			error = VOP_IOCTL(*vpp, DIOCGWEDGEINFO, &dkw, FREAD,
+			    NOCRED);
+			if (error) {
+				printf("zfs: can't get wedge info for "
+				    "dev %s (%d)\n", device_xname(dv), error);
+				/* *vpp is unlocked here, as on every other close path */
+				vdev_disk_detach(*vpp);
+				continue;
+			}
+
+			ZFS_LOG(2, "wedge type \"%s\"", dkw.dkw_ptype);
+			if (strcmp(dkw.dkw_ptype, DKW_PTYPE_ZFS) != 0) {
+				vdev_disk_detach(*vpp);
+				continue;
+			}
+		}
+
+		match = vdev_attach_ok(vd, *vpp);
+		if (match > best_match) {
+			best_match = match;
+			best_dev = dev;
+			best_dv = dv;
+		}
+		if (match == FULL_MATCH) {
+			/*
+			 * We don't know if got to here when
+			 * we're at the out: label below, so
+			 * always close here even though we'll
+			 * reopen straight away.
+			 */
+			vdev_disk_detach(*vpp);
+			goto out;
+		}
+		vdev_disk_detach(*vpp);
+	}
+out:
+	deviter_release(&di);
+	if (best_dev != NODEV) {
+		ZFS_LOG(2, "best device \"%s\"", device_xname(best_dv));
+		/* the scan leaves vdev_devid naming the last device probed */
+		vd->vdev_devid = __UNCONST(device_xname(best_dv));
+		error = vdev_disk_attach_dev(best_dev, vpp);
+		if (error) {
+			printf("ZFS WARNING: Unable to attach to %s.\n",
+			    device_xname(best_dv));
+			*vpp = NULL;
+			return error;
+		}
+		return 0;
+	} else {
+		ZFS_LOG(2, "not found, return EINVAL");
+		*vpp = NULL;
+		return EINVAL;
+	}
+}
+
+/* Find vd's device by label guids and rewrite vd->vdev_path to match. */
+static int
+vdev_disk_open_by_guids(vdev_t *vd, vnode_t **vpp)
+{
+	char *buf;
+	size_t len;
+	int error;
+
+	ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+	    (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+	error = vdev_disk_attach_by_guids(vd, vpp);
+	if (error == 0) {
+		len = strlen(vd->vdev_devid) + strlen("/dev/") + 1;
+		buf = kmem_alloc(len, KM_SLEEP);
+
+		snprintf(buf, len, "/dev/%s", vd->vdev_devid);
+		spa_strfree(vd->vdev_path);
+		vd->vdev_path = buf;
+
+		ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
+		    (uintmax_t)vd->vdev_guid, vd->vdev_path);
+	} else {
+		ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+		    (uintmax_t)spa_guid(vd->vdev_spa),
+		    (uintmax_t)vd->vdev_guid);
+	}
+
+	return error;
+}
+
+/* Open vd->vdev_path, optionally insisting on a full label guid match. */
+static int
+vdev_disk_open_by_path(vdev_t *vd, bool check_guid, vnode_t **vpp)
+{
+	int error;
+
+	ZFS_LOG(2, "path = \"%s\", check_guid = %s", vd->vdev_path,
+	    check_guid ? "true" : "false");
+	error = ENOENT;
+	if (!check_guid || vdev_attach_ok_path(vd) == FULL_MATCH)
+		error = vdev_disk_attach_path(vd, vpp);
+
+	return error;
 }
 
 static int
 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
-    uint64_t *ashift, uint64_t *pashift)
+    uint64_t *logical_ashift, uint64_t *physical_ashift)
 {
 	spa_t *spa = vd->vdev_spa;
 	vdev_disk_t *dvd;
@@ -151,13 +746,23 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 	struct dkwedge_info dkw;
 	struct disk_sectoralign dsa;
 
+	ZFS_LOG(2, "called");
 	/*
 	 * We must have a pathname, and it must be absolute.
 	 */
 	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+		vd->vdev_devid = NULL; /* finish device name helper (ab)use */
+		ZFS_LOG(2, "invalid path");
 		return (SET_ERROR(EINVAL));
 	}
+	ZFS_LOG(2, "vdev_path = \"%s\"", vd->vdev_path);
+
+	if (vd->vdev_devid == NULL) {
+		/* (ab)use vdev_devid for keeping track of this device name */
+		vd->vdev_devid = vd->vdev_path;
+		ZFS_LOG(2, "use vdev_devid = vdev_path = \"%s\"", vd->vdev_devid);
+	}
 
 	/*
 	 * Reopen the device if it's not currently open. Otherwise,
@@ -192,6 +797,9 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 	 *
 	 * 3. Otherwise, the device may have moved.  Try opening the device
 	 *    by the devid instead.
+	 *
+	 * XXXSB investigate, is this still valid? especially with the
+	 * vdev_devid (ab)use i added above.
 	 */
 	if (vd->vdev_devid != NULL) {
 		/* XXXNETBSD wedges */
@@ -204,12 +812,51 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 #endif
 	}
 
-	error = EINVAL;		/* presume failure */
+	if (vd->vdev_spa->spa_splitting_newspa ||
+	    ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+	    (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+	    vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
+		/*
+		 * We are dealing with a vdev that hasn't been previously
+		 * opened (since boot), and we are not loading an
+		 * existing pool configuration.  This looks like a
+		 * vdev add operation to a new or existing pool.
+		 * Assume the user knows what he/she is doing and find
+		 * GEOM provider by its name, ignoring GUID mismatches.
+		 *
+		 * XXXSB GEOM comment to be updated.
+		 *
+		 * XXPOLICY: It would be safer to only allow a device
+		 *           that is unlabeled or labeled but missing
+		 *           GUID information to be opened in this fashion,
+		 *           unless we are doing a split, in which case we
+		 *           should allow any guid.
+		 */
+		error = vdev_disk_open_by_path(vd, false, &vp);
+		ZFS_LOG(2, "vdev_disk_open_by_path error = %d", error);
+	} else {
+		/*
+		 * Try using the recorded path for this device, but only
+		 * accept it if its label data contains the expected GUIDs.
+		 */
+		error = vdev_disk_open_by_path(vd, true, &vp);
+		ZFS_LOG(2, "vdev_disk_open_by_path error = %d", error);
+		if (error) {
+			/*
+			 * The device at vd->vdev_path doesn't have the
+			 * expected GUIDs. The disks might have merely
+			 * moved around so try all other devices
+			 * to find one with the right GUIDs.
+			 */
+			error = vdev_disk_open_by_guids(vd, &vp);
+			ZFS_LOG(2, "vdev_disk_open_by_guids error = %d", error);
+		}
+	}
 
-	error = vn_open(vd->vdev_path, UIO_SYSSPACE, FREAD|FWRITE, 0,
-	    &vp, CRCREAT, 0);
 	if (error != 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		vd->vdev_devid = NULL; /* finish device name helper (ab)use */
+		ZFS_LOG(2, "return error %d", error);
 		return (SET_ERROR(error));
 	}
 	if (vp->v_type != VBLK) {
@@ -219,6 +866,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 		vrele(vp);
 #endif
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		vd->vdev_devid = NULL; /* finish device name helper (ab)use */
+		ZFS_LOG(2, "return EINVAL");
 		return (SET_ERROR(EINVAL));
 	}
 
@@ -254,6 +903,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psi
 #else
 		vrele(vp);
 #endif
+		vd->vdev_devid = NULL; /* finish device name helper (ab)use */
+		ZFS_LOG(2, "return error %d", error);
 		return (SET_ERROR(error));
 	}
 
@@ -263,42 +914,47 @@ skip_open:
 	error = getdisksize(vp, &numsecs, &secsize);
 	if (error != 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		vd->vdev_devid = NULL; /* finish device name helper (ab)use */
+		ZFS_LOG(2, "return error %d", error);
 		return (SET_ERROR(error));
 	}
 
-	*psize = numsecs * secsize;
-	*max_psize = *psize;
-
-	*ashift = highbit(MAX(secsize, SPA_MINBLOCKSIZE)) - 1;
+	/*
+	 * Determine the actual size of the device.
+	 */
+	*max_psize = *psize = numsecs * secsize;
 
 	/*
-	 * Try to determine whether the disk has a preferred physical
-	 * sector size even if it can emulate a smaller logical sector
-	 * size with r/m/w cycles, e.g. a disk with 4096-byte sectors
-	 * that for compatibility claims to support 512-byte ones.
+	 * Determine the device's minimum transfer size and preferred
+	 * transfer size.
 	 */
+	*logical_ashift = highbit(MAX(secsize, SPA_MINBLOCKSIZE)) - 1;
 	if (VOP_IOCTL(vp, DIOCGSECTORALIGN, &dsa, FREAD, NOCRED) == 0) {
-		*pashift = highbit(dsa.dsa_alignment * secsize) - 1;
+		*physical_ashift = highbit(dsa.dsa_alignment * secsize) - 1;
 		if (dsa.dsa_firstaligned % dsa.dsa_alignment)
 			printf("ZFS WARNING: vdev %s: sectors are misaligned"
 			    " (alignment=%"PRIu32", firstaligned=%"PRIu32")\n",
 			    vd->vdev_path,
 			    dsa.dsa_alignment, dsa.dsa_firstaligned);
 	} else {
-		*pashift = *ashift;
+		/* XXX: freebsd doesn't set physical_ashift if not known */
+		*physical_ashift = *logical_ashift;
 	}
 
 	vd->vdev_wholedisk = 0;
+	/* XXX: this test is only correct for wedges... */
 	if (getdiskinfo(vp, &dkw) != 0 &&
 	    dkw.dkw_offset == 0 && dkw.dkw_size == numsecs)
-		vd->vdev_wholedisk = 1,
+		vd->vdev_wholedisk = 1;
 
 	/*
-	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
-	 * try again.
+	 * Clear the nowritecache settings, so that on a vdev_reopen()
+	 * we will try again.
 	 */
 	vd->vdev_nowritecache = B_FALSE;
 
+	vd->vdev_devid = NULL; /* finish device name helper (ab)use */
+	ZFS_LOG(2, "return success 0");
 	return (0);
 }
 
@@ -307,6 +963,7 @@ vdev_disk_close(vdev_t *vd)
 {
 	vdev_disk_t *dvd = vd->vdev_tsd;
 
+	ZFS_LOG(1, "vdev = %p, dvd = %p", vd, dvd);
 	if (vd->vdev_reopening || dvd == NULL)
 		return;
 
@@ -352,6 +1009,7 @@ vdev_disk_close(vdev_t *vd)
 	vdev_disk_free(vd);
 }
 
+#if 0 /* XXX unused? */
 int
 vdev_disk_physio(vdev_t *vd, caddr_t data,
     size_t size, uint64_t offset, int flags, boolean_t isdump)
@@ -384,6 +1042,7 @@ vdev_disk_physio(vdev_t *vd, caddr_t dat
 	return (EIO);
 #endif
 }
+#endif /* XXX unused? */
 
 static void
 vdev_disk_io_intr(buf_t *bp)
@@ -404,6 +1063,7 @@ vdev_disk_io_intr(buf_t *bp)
 	zio_delay_interrupt(zio);
 }
 
+#if 0 /* XXX unused? */
 static void
 vdev_disk_ioctl_free(zio_t *zio)
 {
@@ -424,6 +1084,7 @@ vdev_disk_ioctl_done(void *zio_arg, int
 
 	zio_interrupt(zio);
 }
+#endif /* XXX unused? */
 
 static void
 vdev_disk_io_start(zio_t *zio)
@@ -561,6 +1222,25 @@ vdev_disk_io_done(zio_t *zio)
 #endif
 }
 
+/*
+ * It's not clear what these hold/rele functions are supposed to do.
+ */
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+}
+
 vdev_ops_t vdev_disk_ops = {
 	vdev_disk_open,
 	vdev_disk_close,
@@ -573,86 +1253,3 @@ vdev_ops_t vdev_disk_ops = {
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
-
-/*
- * Given the root disk device devid or pathname, read the label from
- * the device, and construct a configuration nvlist.
- */
-int
-vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config)
-{
-#ifdef __NetBSD__
-	return (ENOTSUP);
-#else
-	ldi_handle_t vd_lh;
-	vdev_label_t *label;
-	uint64_t s, size;
-	int l;
-	ddi_devid_t tmpdevid;
-	int error = -1;
-	char *minor_name;
-
-	/*
-	 * Read the device label and build the nvlist.
-	 */
-	if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid,
-	    &minor_name) == 0) {
-		error = ldi_open_by_devid(tmpdevid, minor_name,
-		    FREAD, kcred, &vd_lh, zfs_li);
-		ddi_devid_free(tmpdevid);
-		ddi_devid_str_free(minor_name);
-	}
-
-	if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh,
-	    zfs_li)))
-		return (error);
-
-	if (ldi_get_size(vd_lh, &s)) {
-		(void) ldi_close(vd_lh, FREAD, kcred);
-		return (SET_ERROR(EIO));
-	}
-
-	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
-	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);
-
-	*config = NULL;
-	for (l = 0; l < VDEV_LABELS; l++) {
-		uint64_t offset, state, txg = 0;
-
-		/* read vdev label */
-		offset = vdev_label_offset(size, l, 0);
-		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
-		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
-			continue;
-
-		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
-		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
-			*config = NULL;
-			continue;
-		}
-
-		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
-		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
-			nvlist_free(*config);
-			*config = NULL;
-			continue;
-		}
-
-		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
-		    &txg) != 0 || txg == 0) {
-			nvlist_free(*config);
-			*config = NULL;
-			continue;
-		}
-
-		break;
-	}
-
-	kmem_free(label, sizeof (vdev_label_t));
-	(void) ldi_close(vd_lh, FREAD, kcred);
-	if (*config == NULL)
-		error = SET_ERROR(EIDRM);
-
-	return (error);
-#endif
-}