From 25ed7e5924fd6e97b17831d2b42ecb3975cd71f2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 28 Sep 2009 14:40:59 -0700 Subject: imsm: cleanup disk status tests Add is_failed(), is_configured(), and is_spare() helpers to clean up disk status flag testing. Signed-off-by: Dan Williams --- super-intel.c | 53 +++++++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/super-intel.c b/super-intel.c index 07b0b90..9f57a68 100644 --- a/super-intel.c +++ b/super-intel.c @@ -612,6 +612,21 @@ static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl) } #ifndef MDASSEMBLE +static int is_spare(struct imsm_disk *disk) +{ + return (disk->status & SPARE_DISK) == SPARE_DISK; +} + +static int is_configured(struct imsm_disk *disk) +{ + return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK; +} + +static int is_failed(struct imsm_disk *disk) +{ + return (disk->status & FAILED_DISK) == FAILED_DISK; +} + static void print_imsm_dev(struct imsm_dev *dev, char *uuid, int disk_idx) { __u64 sz; @@ -676,7 +691,6 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) { struct imsm_disk *disk = __get_imsm_disk(mpb, index); char str[MAX_RAID_SERIAL_LEN + 1]; - __u32 s; __u64 sz; if (index < 0) @@ -685,10 +699,9 @@ static void print_imsm_disk(struct imsm_super *mpb, int index, __u32 reserved) printf("\n"); snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); printf(" Disk%02d Serial : %s\n", index, str); - s = disk->status; - printf(" State :%s%s%s\n", s&SPARE_DISK ? " spare" : "", - s&CONFIGURED_DISK ? " active" : "", - s&FAILED_DISK ? " failed" : ""); + printf(" State :%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? " failed" : ""); printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); sz = __le32_to_cpu(disk->total_blocks) - reserved; printf(" Usable Size : %llu%s\n", (unsigned long long)sz, @@ -1298,7 +1311,6 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) { struct intel_super *super = st->sb; struct imsm_disk *disk; - __u32 s; if (super->current_vol >= 0) { getinfo_super_imsm_volume(st, info); @@ -1334,14 +1346,13 @@ static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info) disk = &super->disks->disk; info->data_offset = __le32_to_cpu(disk->total_blocks) - reserved; info->component_size = reserved; - s = disk->status; - info->disk.state = s & CONFIGURED_DISK ? (1 << MD_DISK_ACTIVE) : 0; + info->disk.state = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0; /* we don't change info->disk.raid_disk here because * this state will be finalized in mdmon after we have * found the 'most fresh' version of the metadata */ - info->disk.state |= s & FAILED_DISK ? (1 << MD_DISK_FAULTY) : 0; - info->disk.state |= s & SPARE_DISK ? 0 : (1 << MD_DISK_SYNC); + info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + info->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); } /* only call uuid_from_super_imsm when this disk is part of a populated container, @@ -3444,7 +3455,6 @@ static struct mdinfo *container_content_imsm(struct supertype *st) struct dl *d; int idx; int skip; - __u32 s; __u32 ord; skip = 0; @@ -3456,9 +3466,7 @@ static struct mdinfo *container_content_imsm(struct supertype *st) if (d == NULL) skip = 1; - - s = d ? d->disk.status : 0; - if (s & FAILED_DISK) + if (d && is_failed(&d->disk)) skip = 1; if (ord & IMSM_ORD_REBUILD) skip = 1; @@ -3565,8 +3573,7 @@ static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, insync = 2; disk = get_imsm_disk(super, idx); - if (!disk || disk->status & FAILED_DISK || - ord & IMSM_ORD_REBUILD) + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) insync--; /* no in-sync disks left in this mirror the @@ -3616,8 +3623,7 @@ static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev) idx = ord_to_idx(ord); disk = get_imsm_disk(super, idx); - if (!disk || disk->status & FAILED_DISK || - ord & IMSM_ORD_REBUILD) + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) failed++; } @@ -3676,7 +3682,7 @@ static int mark_failure(struct imsm_dev *dev, struct imsm_disk *disk, int idx) return 0; ord = __le32_to_cpu(map->disk_ord_tbl[slot]); - if ((disk->status & FAILED_DISK) && (ord & IMSM_ORD_REBUILD)) + if (is_failed(disk) && (ord & IMSM_ORD_REBUILD)) return 0; disk->status |= FAILED_DISK; @@ -3877,7 +3883,7 @@ static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_a if (dl->index == i) break; - if (dl && dl->disk.status & FAILED_DISK) + if (dl && is_failed(&dl->disk)) dl = NULL; if (dl) @@ -3915,11 +3921,10 @@ static struct dl *imsm_add_spare(struct intel_super *super, int slot, continue; /* skip in use or failed drives */ - if (dl->disk.status & FAILED_DISK || idx == dl->index || + if (is_failed(&dl->disk) || idx == dl->index || dl->index == -2) { dprintf("%x:%x status (failed: %d index: %d)\n", - dl->major, dl->minor, - (dl->disk.status & FAILED_DISK) == FAILED_DISK, idx); + dl->major, dl->minor, is_failed(&dl->disk), idx); continue; } @@ -4221,7 +4226,7 @@ static void imsm_process_update(struct supertype *st, if (i == u->slot) continue; disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i)); - if (!disk || disk->status & FAILED_DISK) + if (!disk || is_failed(disk)) failed++; } -- cgit v1.2.1 From 51725a7c2569b764f59f009bc0ef42901a1ec915 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Sep 2009 11:44:38 -0700 Subject: imsm: kill close() of component device None of the other formats close the passed in fd at load, and this becomes a problem when trying to support --update where we need O_EXCL protection across the entire operation. Signed-off-by: Dan Williams --- super-intel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/super-intel.c b/super-intel.c index 9f57a68..80cd6c5 100644 --- a/super-intel.c +++ b/super-intel.c @@ -1685,10 +1685,8 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) serialcpy(dl->serial, serial); dl->index = -2; dl->e = NULL; - } else if (keep_fd) { - close(dl->fd); + } else if (keep_fd) dl->fd = fd; - } /* look up this disk's index in the current anchor */ for (i = 0; i < super->anchor->num_disks; i++) { -- cgit v1.2.1 From a2b9798159755b6f5e867fae0dd3e25af59fc85e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 30 Sep 2009 11:45:41 -0700 Subject: imsm: disambiguate family_num This is a result of trawling through the Windows implementation to learn the mechanism of how it disambiguates family_num. It is a continuation of commit 148acb7b "imsm: fix family number handling" which introduced a regression when reassembling a container with stale disks and rebuilt members. When rebuilding, a new family number is assigned to protect against the "prodigal array member" problem. It prevents a former family member from returning to the system and causing a rebuild to go the wrong direction. However, this invalidates looking at the generation number to determine the most up-to-date disk when comparing across family numbers. Instead the assembly logic looks for agreement between a disk's local family membership compared against a global list of all families in the system. Whenever a disk's local metadata does not match a family number on the global list that family number is marked offline. It is possible that this logic results in multiple incompatible but valid family numbers existing in a container. In this case mdadm.conf cannot be consulted because it only records the uuid which is generated from static fields in the metadata. The metadata lacks the data needed to disambiguate "local" versus "foreign". The "foreign" array in this case requires updating to change its container-id information (orig_family_num), and possibly the member array names. Signed-off-by: Dan Williams --- super-intel.c | 580 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 448 insertions(+), 132 deletions(-) diff --git a/super-intel.c b/super-intel.c index 80cd6c5..e53afbb 100644 --- a/super-intel.c +++ b/super-intel.c @@ -265,6 +265,14 @@ struct intel_super { struct bbm_log *bbm_log; const char *hba; /* device path of the raid controller for this metadata */ const struct imsm_orom *orom; /* platform firmware support */ + struct intel_super *next; /* (temp) list for disambiguating family_num */ +}; + +struct intel_disk { + struct imsm_disk disk; + #define IMSM_UNKNOWN_OWNER (-1) + int owner; + struct intel_disk *next; }; struct extent { @@ -1477,8 +1485,19 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) */ if (first->anchor->num_raid_devs > 0 && sec->anchor->num_raid_devs > 0) { - if (first->anchor->orig_family_num != sec->anchor->orig_family_num || - first->anchor->family_num != sec->anchor->family_num) + /* Determine if these disks might ever have been + * related. Further disambiguation can only take place + * in load_super_imsm_all + */ + __u32 first_family = first->anchor->orig_family_num; + __u32 sec_family = sec->anchor->orig_family_num; + + if (first_family == 0) + first_family = first->anchor->family_num; + if (sec_family == 0) + sec_family = sec->anchor->family_num; + + if (first_family != sec_family) return 3; } @@ -1548,7 +1567,6 @@ static void fd2devname(int fd, char *name) snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); } - extern int scsi_get_serial(int fd, void *buf, size_t buf_len); static int imsm_read_serial(int fd, char *devname, @@ -1642,14 +1660,32 @@ static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) return dl; } +static struct imsm_disk * +__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) +{ + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + + if (serialcmp(disk->serial, serial) == 0) { + if (idx) + *idx = i; + return disk; + } + } + + return NULL; +} + static int load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) { + struct imsm_disk *disk; struct dl *dl; struct stat stb; int rv; - int i; - int alloc = 1; + char name[40]; __u8 serial[MAX_RAID_SERIAL_LEN]; rv = imsm_read_serial(fd, devname, serial); @@ -1657,16 +1693,7 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) if (rv != 0) return 2; - /* check if this is a disk we have seen before. it may be a spare in - * super->disks while the current anchor believes it is a raid member, - * check if we need to update dl->index - */ - dl = serial_to_dl(serial, super); - if (!dl) - dl = malloc(sizeof(*dl)); - else - alloc = 0; - + dl = calloc(1, sizeof(*dl)); if (!dl) { if (devname) fprintf(stderr, @@ -1675,51 +1702,35 @@ load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) return 2; } - if (alloc) { - fstat(fd, &stb); - dl->major = major(stb.st_rdev); - dl->minor = minor(stb.st_rdev); - dl->next = super->disks; - dl->fd = keep_fd ? fd : -1; - dl->devname = devname ? strdup(devname) : NULL; - serialcpy(dl->serial, serial); - dl->index = -2; - dl->e = NULL; - } else if (keep_fd) - dl->fd = fd; + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? fd : -1; + assert(super->disks == NULL); + super->disks = dl; + serialcpy(dl->serial, serial); + dl->index = -2; + dl->e = NULL; + fd2devname(fd, name); + if (devname) + dl->devname = strdup(devname); + else + dl->devname = strdup(name); /* look up this disk's index in the current anchor */ - for (i = 0; i < super->anchor->num_disks; i++) { - struct imsm_disk *disk_iter; - - disk_iter = __get_imsm_disk(super->anchor, i); - - if (serialcmp(disk_iter->serial, dl->serial) == 0) { - dl->disk = *disk_iter; - /* only set index on disks that are a member of a - * populated contianer, i.e. one with raid_devs - */ - if (dl->disk.status & FAILED_DISK) - dl->index = -2; - else if (dl->disk.status & SPARE_DISK) - dl->index = -1; - else - dl->index = i; - - break; - } - } - - /* no match, maybe a stale failed drive */ - if (i == super->anchor->num_disks && dl->index >= 0) { - dl->disk = *__get_imsm_disk(super->anchor, dl->index); - if (dl->disk.status & FAILED_DISK) + disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of a + * populated contianer, i.e. one with raid_devs + */ + if (is_failed(&dl->disk)) dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; } - if (alloc) - super->disks = dl; - return 0; } @@ -1861,7 +1872,6 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) struct stat; struct imsm_super *anchor; __u32 check_sum; - int rc; get_dev_size(fd, NULL, &dsize); @@ -1923,10 +1933,7 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) return 2; } - rc = load_imsm_disk(fd, super, devname, 0); - if (rc == 0) - rc = parse_raid_devices(super); - return rc; + return 0; } /* read the extended mpb */ @@ -1962,11 +1969,23 @@ static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) */ super->bbm_log = __get_imsm_bbm_log(super->anchor); - rc = load_imsm_disk(fd, super, devname, 0); - if (rc == 0) - rc = parse_raid_devices(super); + return 0; +} + +static int +load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + int err; + + err = load_imsm_mpb(fd, super, devname); + if (err) + return err; + err = load_imsm_disk(fd, super, devname, keep_fd); + if (err) + return err; + err = parse_raid_devices(super); - return rc; + return err; } static void __free_imsm_disk(struct dl *d) @@ -2096,19 +2115,333 @@ static int find_missing(struct intel_super *super) return 0; } +static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list) +{ + struct intel_disk *idisk = disk_list; + + while (idisk) { + if (serialcmp(idisk->disk.serial, serial) == 0) + break; + idisk = idisk->next; + } + + return idisk; +} + +static int __prep_thunderdome(struct intel_super **table, int tbl_size, + struct intel_super *super, + struct intel_disk **disk_list) +{ + struct imsm_disk *d = &super->disks->disk; + struct imsm_super *mpb = super->anchor; + int i, j; + + for (i = 0; i < tbl_size; i++) { + struct imsm_super *tbl_mpb = table[i]->anchor; + struct imsm_disk *tbl_d = &table[i]->disks->disk; + + if (tbl_mpb->family_num == mpb->family_num) { + if (tbl_mpb->check_sum == mpb->check_sum) { + dprintf("%s: mpb from %d:%d matches %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + break; + } + + if (((is_configured(d) && !is_configured(tbl_d)) || + is_configured(d) == is_configured(tbl_d)) && + tbl_mpb->generation_num < mpb->generation_num) { + /* current version of the mpb is a + * better candidate than the one in + * super_table, but copy over "cross + * generational" status + */ + struct intel_disk *idisk; + + dprintf("%s: mpb from %d:%d replaces %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + idisk = disk_list_get(tbl_d->serial, *disk_list); + if (idisk && is_failed(&idisk->disk)) + tbl_d->status |= FAILED_DISK; + break; + } else { + struct intel_disk *idisk; + struct imsm_disk *disk; + + /* tbl_mpb is more up to date, but copy + * over cross generational status before + * returning + */ + disk = __serial_to_disk(d->serial, mpb, NULL); + if (disk && is_failed(disk)) + d->status |= FAILED_DISK; + + idisk = disk_list_get(d->serial, *disk_list); + if (idisk) { + idisk->owner = i; + if (disk && is_configured(disk)) + idisk->disk.status |= CONFIGURED_DISK; + } + + dprintf("%s: mpb from %d:%d prefer %d:%d\n", + __func__, super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + return tbl_size; + } + } + } + + if (i >= tbl_size) + table[tbl_size++] = super; + else + table[i] = super; + + /* update/extend the merged list of imsm_disk records */ + for (j = 0; j < mpb->num_disks; j++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, j); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, *disk_list); + if (idisk) { + idisk->disk.status |= disk->status; + if (is_configured(&idisk->disk) || + is_failed(&idisk->disk)) + idisk->disk.status &= ~(SPARE_DISK); + } else { + idisk = calloc(1, sizeof(*idisk)); + if (!idisk) + return -1; + idisk->owner = IMSM_UNKNOWN_OWNER; + idisk->disk = *disk; + idisk->next = *disk_list; + *disk_list = idisk; + } + + if (serialcmp(idisk->disk.serial, d->serial) == 0) + idisk->owner = i; + } + + return tbl_size; +} + +static struct intel_super * +validate_members(struct intel_super *super, struct intel_disk *disk_list, + const int owner) +{ + struct imsm_super *mpb = super->anchor; + int ok_count = 0; + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, disk_list); + if (idisk) { + if (idisk->owner == owner || + idisk->owner == IMSM_UNKNOWN_OWNER) + ok_count++; + else + dprintf("%s: '%.16s' owner %d != %d\n", + __func__, disk->serial, idisk->owner, + owner); + } else { + dprintf("%s: unknown disk %x [%d]: %.16s\n", + __func__, __le32_to_cpu(mpb->family_num), i, + disk->serial); + break; + } + } + + if (ok_count == mpb->num_disks) + return super; + return NULL; +} + +static void show_conflicts(__u32 family_num, struct intel_super *super_list) +{ + struct intel_super *s; + + for (s = super_list; s; s = s->next) { + if (family_num != s->anchor->family_num) + continue; + fprintf(stderr, "Conflict, offlining family %#x on '%s'\n", + __le32_to_cpu(family_num), s->disks->devname); + } +} + +static struct intel_super * +imsm_thunderdome(struct intel_super **super_list, int len) +{ + struct intel_super *super_table[len]; + struct intel_disk *disk_list = NULL; + struct intel_super *champion, *spare; + struct intel_super *s, **del; + int tbl_size = 0; + int conflict; + int i; + + memset(super_table, 0, sizeof(super_table)); + for (s = *super_list; s; s = s->next) + tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list); + + for (i = 0; i < tbl_size; i++) { + struct imsm_disk *d; + struct intel_disk *idisk; + struct imsm_super *mpb = super_table[i]->anchor; + + s = super_table[i]; + d = &s->disks->disk; + + /* 'd' must appear in merged disk list for its + * configuration to be valid + */ + idisk = disk_list_get(d->serial, disk_list); + if (idisk && idisk->owner == i) + s = validate_members(s, disk_list, i); + else + s = NULL; + + if (!s) + dprintf("%s: marking family: %#x from %d:%d offline\n", + __func__, mpb->family_num, + super_table[i]->disks->major, + super_table[i]->disks->minor); + super_table[i] = s; + } + + /* This is where the mdadm implementation differs from the Windows + * driver which has no strict concept of a container. We can only + * assemble one family from a container, so when returning a prodigal + * array member to this system the code will not be able to disambiguate + * the container contents that should be assembled ("foreign" versus + * "local"). It requires user intervention to set the orig_family_num + * to a new value to establish a new container. The Windows driver in + * this situation fixes up the volume name in place and manages the + * foreign array as an independent entity. + */ + s = NULL; + spare = NULL; + conflict = 0; + for (i = 0; i < tbl_size; i++) { + struct intel_super *tbl_ent = super_table[i]; + int is_spare = 0; + + if (!tbl_ent) + continue; + + if (tbl_ent->anchor->num_raid_devs == 0) { + spare = tbl_ent; + is_spare = 1; + } + + if (s && !is_spare) { + show_conflicts(tbl_ent->anchor->family_num, *super_list); + conflict++; + } else if (!s && !is_spare) + s = tbl_ent; + } + + if (!s) + s = spare; + if (!s) { + champion = NULL; + goto out; + } + champion = s; + + if (conflict) + fprintf(stderr, "Chose family %#x on '%s', " + "assemble conflicts to new container with '--update=uuid'\n", + __le32_to_cpu(s->anchor->family_num), s->disks->devname); + + /* collect all dl's onto 'champion', and update them to + * champion's version of the status + */ + for (s = *super_list; s; s = s->next) { + struct imsm_super *mpb = champion->anchor; + struct dl *dl = s->disks; + + if (s == champion) + continue; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk; + + disk = __serial_to_disk(dl->serial, mpb, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of + * a populated contianer, i.e. one with + * raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + break; + } + } + + if (i >= mpb->num_disks) { + struct intel_disk *idisk; + + idisk = disk_list_get(dl->serial, disk_list); + if (is_spare(&idisk->disk) && + !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) + dl->index = -1; + else { + dl->index = -2; + continue; + } + } + + dl->next = champion->disks; + champion->disks = dl; + s->disks = NULL; + } + + /* delete 'champion' from super_list */ + for (del = super_list; *del; ) { + if (*del == champion) { + *del = (*del)->next; + break; + } else + del = &(*del)->next; + } + champion->next = NULL; + + out: + while (disk_list) { + struct intel_disk *idisk = disk_list; + + disk_list = disk_list->next; + free(idisk); + } + + return champion; +} + static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, char *devname, int keep_fd) { struct mdinfo *sra; - struct intel_super *super; - struct mdinfo *sd, *best = NULL; - __u32 bestgen = 0; - __u32 gen; - char nm[20]; - int dfd; - int rv; + struct intel_super *super_list = NULL; + struct intel_super *super = NULL; int devnum = fd2devnum(fd); + struct mdinfo *sd; int retry; + int err = 0; + int i; enum sysfs_read_flags flags; flags = GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE; @@ -2125,81 +2458,51 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, strcmp(sra->text_version, "imsm") != 0) return 1; - super = alloc_super(0); - if (!super) - return 1; + /* load all mpbs */ + for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { + struct intel_super *s = alloc_super(0); + char nm[20]; + int dfd; + + err = 1; + if (!s) + goto error; + s->next = super_list; + super_list = s; - /* find the most up to date disk in this array, skipping spares */ - for (sd = sra->devs; sd; sd = sd->next) { + err = 2; sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); dfd = dev_open(nm, keep_fd ? O_RDWR : O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 2; - } - rv = load_imsm_mpb(dfd, super, NULL); + if (dfd < 0) + goto error; + + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); /* retry the load if we might have raced against mdmon */ - if (rv == 3 && mdmon_running(devnum)) + if (err == 3 && mdmon_running(devnum)) for (retry = 0; retry < 3; retry++) { usleep(3000); - rv = load_imsm_mpb(dfd, super, NULL); - if (rv != 3) + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + if (err != 3) break; } if (!keep_fd) close(dfd); - if (rv == 0) { - if (super->anchor->num_raid_devs == 0) - gen = 0; - else - gen = __le32_to_cpu(super->anchor->generation_num); - if (!best || gen > bestgen) { - bestgen = gen; - best = sd; - } - } else { - free_imsm(super); - return rv; - } - } - - if (!best) { - free_imsm(super); - return 1; - } - - /* load the most up to date anchor */ - sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); - dfd = dev_open(nm, O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 1; - } - rv = load_imsm_mpb(dfd, super, NULL); - close(dfd); - if (rv != 0) { - free_imsm(super); - return 2; + if (err) + goto error; } - /* re-parse the disk list with the current anchor */ - for (sd = sra->devs ; sd ; sd = sd->next) { - sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); - dfd = dev_open(nm, keep_fd? O_RDWR : O_RDONLY); - if (dfd < 0) { - free_imsm(super); - return 2; - } - load_imsm_disk(dfd, super, NULL, keep_fd); - if (!keep_fd) - close(dfd); + /* all mpbs enter, maybe one leaves */ + super = imsm_thunderdome(&super_list, i); + if (!super) { + err = 1; + goto error; } - if (find_missing(super) != 0) { free_imsm(super); - return 2; + err = 2; + goto error; } if (st->subarray[0]) { @@ -2207,13 +2510,26 @@ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, super->current_vol = atoi(st->subarray); else { free_imsm(super); - return 1; + err = 1; + goto error; } } + err = 0; + + error: + while (super_list) { + struct intel_super *s = super_list; + + super_list = super_list->next; + free_imsm(s); + } + + if (err) + return err; *sbp = super; st->container_dev = devnum; - if (st->ss == NULL) { + if (err == 0 && st->ss == NULL) { st->ss = &super_imsm; st->minor_version = 0; st->max_devs = IMSM_MAX_DEVICES; @@ -2244,7 +2560,7 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname) return 1; } - rv = load_imsm_mpb(fd, super, devname); + rv = load_and_parse_mpb(fd, super, devname, 0); if (rv) { if (devname) -- cgit v1.2.1 From f796af5d5ea603085ce6bcf3c171b89a1f84f37a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: imsm: fix spare record writeout race imsm_activate_spare() in the manager thread may race against write_super_imsm_spares() in the monitor thread. Give write_super_imsm_spares() its own private mpb buffer to prevent confusing the manager. This change uncovered cases where spares were not being assembled due to a failed metadata version number check. Spares can freely associate across metadata version number, so reduce the scope of the version check in the spare assembly case. Signed-off-by: Dan Williams --- super-intel.c | 59 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/super-intel.c b/super-intel.c index e53afbb..0e3ed89 100644 --- a/super-intel.c +++ b/super-intel.c @@ -1477,9 +1477,6 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) return 0; } - if (memcmp(first->anchor->sig, sec->anchor->sig, MAX_SIGNATURE_LENGTH) != 0) - return 3; - /* if an anchor does not have num_raid_devs set then it is a free * floating spare */ @@ -1492,6 +1489,10 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) __u32 first_family = first->anchor->orig_family_num; __u32 sec_family = sec->anchor->orig_family_num; + if (memcmp(first->anchor->sig, sec->anchor->sig, + MAX_SIGNATURE_LENGTH) != 0) + return 3; + if (first_family == 0) first_family = first->anchor->family_num; if (sec_family == 0) @@ -1499,8 +1500,10 @@ static int compare_super_imsm(struct supertype *st, struct supertype *tst) if (first_family != sec_family) return 3; + } + /* if 'first' is a spare promote it to a populated mpb with sec's * family number */ @@ -2976,39 +2979,48 @@ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, return 0; } -static int store_imsm_mpb(int fd, struct intel_super *super); +static int store_imsm_mpb(int fd, struct imsm_super *mpb); + +static union { + char buf[512]; + struct imsm_super anchor; +} spare_record __attribute__ ((aligned(512))); /* spare records have their own family number and do not have any defined raid * devices */ static int write_super_imsm_spares(struct intel_super *super, int doclose) { - struct imsm_super mpb_save; struct imsm_super *mpb = super->anchor; + struct imsm_super *spare = &spare_record.anchor; __u32 sum; struct dl *d; - mpb_save = *mpb; - mpb->num_raid_devs = 0; - mpb->num_disks = 1; - mpb->mpb_size = sizeof(struct imsm_super); - mpb->generation_num = __cpu_to_le32(1UL); + spare->mpb_size = __cpu_to_le32(sizeof(struct imsm_super)), + spare->generation_num = __cpu_to_le32(1UL), + spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + spare->num_disks = 1, + spare->num_raid_devs = 0, + spare->cache_size = mpb->cache_size, + spare->pwr_cycle_count = __cpu_to_le32(1), + + snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH, + MPB_SIGNATURE MPB_VERSION_RAID0); for (d = super->disks; d; d = d->next) { if (d->index != -1) continue; - mpb->disk[0] = d->disk; - sum = __gen_imsm_checksum(mpb); - mpb->family_num = __cpu_to_le32(sum); - mpb->orig_family_num = 0; - sum = __gen_imsm_checksum(mpb); - mpb->check_sum = __cpu_to_le32(sum); + spare->disk[0] = d->disk; + sum = __gen_imsm_checksum(spare); + spare->family_num = __cpu_to_le32(sum); + spare->orig_family_num = 0; + sum = __gen_imsm_checksum(spare); + spare->check_sum = __cpu_to_le32(sum); - if (store_imsm_mpb(d->fd, super)) { + if (store_imsm_mpb(d->fd, spare)) { fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); - *mpb = mpb_save; return 1; } if (doclose) { @@ -3017,7 +3029,6 @@ static int write_super_imsm_spares(struct intel_super *super, int doclose) } } - *mpb = mpb_save; return 0; } @@ -3069,7 +3080,7 @@ static int write_super_imsm(struct intel_super *super, int doclose) for (d = super->disks; d ; d = d->next) { if (d->index < 0) continue; - if (store_imsm_mpb(d->fd, super)) + if (store_imsm_mpb(d->fd, mpb)) fprintf(stderr, "%s: failed for device %d:%d %s\n", __func__, d->major, d->minor, strerror(errno)); if (doclose) { @@ -4144,9 +4155,9 @@ static void imsm_set_disk(struct active_array *a, int n, int state) } } -static int store_imsm_mpb(int fd, struct intel_super *super) +static int store_imsm_mpb(int fd, struct imsm_super *mpb) { - struct imsm_super *mpb = super->anchor; + void *buf = mpb; __u32 mpb_size = __le32_to_cpu(mpb->mpb_size); unsigned long long dsize; unsigned long long sectors; @@ -4161,7 +4172,7 @@ static int store_imsm_mpb(int fd, struct intel_super *super) if (lseek64(fd, dsize - (512 * (2 + sectors)), SEEK_SET) < 0) return 1; - if (write(fd, super->buf + 512, 512 * sectors) != 512 * sectors) + if (write(fd, buf + 512, 512 * sectors) != 512 * sectors) return 1; } @@ -4169,7 +4180,7 @@ static int store_imsm_mpb(int fd, struct intel_super *super) if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) return 1; - if (write(fd, super->buf, 512) != 512) + if (write(fd, buf, 512) != 512) return 1; return 0; -- cgit v1.2.1 From e683ca88ac4c2f55059e8c82aff7a487a0884ef7 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: imsm: fix/support --update Fix init_super_imsm() to return an empty mpb when info == NULL, and teach store_super_imsm() to simply write out the passed in mpb. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=523320 Reported-by: Hans de Goede Signed-off-by: Dan Williams --- super-intel.c | 54 ++++++++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/super-intel.c b/super-intel.c index 0e3ed89..eaf5b0b 100644 --- a/super-intel.c +++ b/super-intel.c @@ -2819,24 +2819,33 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, size_t mpb_size; char *version; - if (!info) { - st->sb = NULL; - return 0; - } if (st->sb) - return init_super_imsm_volume(st, info, size, name, homehost, - uuid); + return init_super_imsm_volume(st, info, size, name, homehost, uuid); + + if (info) + mpb_size = disks_to_mpb_size(info->nr_disks); + else + mpb_size = 512; super = alloc_super(1); - if (!super) - return 0; - mpb_size = disks_to_mpb_size(info->nr_disks); - if (posix_memalign(&super->buf, 512, mpb_size) != 0) { + if (super && posix_memalign(&super->buf, 512, mpb_size) != 0) { free(super); + super = NULL; + } + if (!super) { + fprintf(stderr, Name + ": %s could not allocate superblock\n", __func__); return 0; } + memset(super->buf, 0, mpb_size); mpb = super->buf; - memset(mpb, 0, mpb_size); + mpb->mpb_size = __cpu_to_le32(mpb_size); + st->sb = super; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; @@ -2844,9 +2853,7 @@ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, strcpy(version, MPB_SIGNATURE); version += strlen(MPB_SIGNATURE); strcpy(version, MPB_VERSION_RAID0); - mpb->mpb_size = mpb_size; - st->sb = super; return 1; } @@ -3188,24 +3195,15 @@ static int write_init_super_imsm(struct supertype *st) } #endif -static int store_zero_imsm(struct supertype *st, int fd) +static int store_super_imsm(struct supertype *st, int fd) { - unsigned long long dsize; - void *buf; - - get_dev_size(fd, NULL, &dsize); - - /* first block is stored on second to last sector of the disk */ - if (lseek64(fd, dsize - (512 * 2), SEEK_SET) < 0) - return 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb = super ? super->anchor : NULL; - if (posix_memalign(&buf, 512, 512) != 0) + if (!mpb) return 1; - memset(buf, 0, 512); - if (write(fd, buf, 512) != 512) - return 1; - return 0; + return store_imsm_mpb(fd, mpb); } static int imsm_bbm_log_size(struct imsm_super *mpb) @@ -4914,7 +4912,7 @@ struct superswitch super_imsm = { .load_super = load_super_imsm, .init_super = init_super_imsm, - .store_super = store_zero_imsm, + .store_super = store_super_imsm, .free_super = free_super_imsm, .match_metadata_desc = match_metadata_desc_imsm, .container_content = container_content_imsm, -- cgit v1.2.1 From 955e9ea1394662f097a88bb3d62c56ab50448597 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: ddf: prevent superblock being zeroed on --update The full fix would be to support updating ddf metadata, but this minimal fix just prevents the superblock from being zeroed when someone inadvertently passes an unsupported --update option during assembly. Reported-by: Hans de Goede Signed-off-by: Dan Williams --- super-ddf.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/super-ddf.c b/super-ddf.c index 9bf08c2..06858e2 100644 --- a/super-ddf.c +++ b/super-ddf.c @@ -1589,13 +1589,8 @@ static int init_super_ddf(struct supertype *st, struct phys_disk *pd; struct virtual_disk *vd; - if (!info) { - st->sb = NULL; - return 0; - } if (st->sb) - return init_super_ddf_bvd(st, info, size, name, homehost, - uuid); + return init_super_ddf_bvd(st, info, size, name, homehost, uuid); if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { fprintf(stderr, Name ": %s could not allocate superblock\n", __func__); @@ -1604,6 +1599,12 @@ static int init_super_ddf(struct supertype *st, memset(ddf, 0, sizeof(*ddf)); ddf->dlist = NULL; /* no physical disks yet */ ddf->conflist = NULL; /* No virtual disks yet */ + st->sb = ddf; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } /* At least 32MB *must* be reserved for the ddf. So let's just * start 32MB from the end, and put the primary header there. @@ -2971,12 +2972,22 @@ static struct mdinfo *container_content_ddf(struct supertype *st) return rest; } -static int store_zero_ddf(struct supertype *st, int fd) +static int store_super_ddf(struct supertype *st, int fd) { + struct ddf_super *ddf = st->sb; unsigned long long dsize; void *buf; int rc; + if (!ddf) + return 1; + + /* ->dlist and ->conflist will be set for updates, currently not + * supported + */ + if (ddf->dlist || ddf->conflist) + return 1; + if (!get_dev_size(fd, NULL, &dsize)) return 1; @@ -3627,7 +3638,7 @@ struct superswitch super_ddf = { .load_super = load_super_ddf, .init_super = init_super_ddf, - .store_super = store_zero_ddf, + .store_super = store_super_ddf, .free_super = free_super_ddf, .match_metadata_desc = match_metadata_desc_ddf, .container_content = container_content_ddf, -- cgit v1.2.1 From 6e46bf344bf34a688696e240596f8259e328eea9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: imsm: add --update=uuid support When disks have conflicting container memberships (same container ids but incompatible member arrays) --update=uuid can be used to move offenders to a new container id by changing 'orig_family_num'. Note that this only supports random updates of the uuid as the actual uuid is synthesized. We also need to communicate the new 'orig_family_num' value to all disks involved in the update. A new field 'update_private' is added to struct mdinfo to allow this information to be transmitted. Signed-off-by: Dan Williams --- Assemble.c | 3 +++ mdadm.h | 5 +++++ super-intel.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/Assemble.c b/Assemble.c index 4578906..7da0905 100644 --- a/Assemble.c +++ b/Assemble.c @@ -565,6 +565,7 @@ int Assemble(struct supertype *st, char *mddev, #endif /* Ok, no bad inconsistancy, we can try updating etc */ bitmap_done = 0; + content->update_private = NULL; for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) if (tmpdev->used == 1) { char *devname = tmpdev->devname; struct stat stb; @@ -717,6 +718,8 @@ int Assemble(struct supertype *st, char *mddev, } devcnt++; } + free(content->update_private); + content->update_private = NULL; if (devcnt == 0) { fprintf(stderr, Name ": no devices found for %s\n", diff --git a/mdadm.h b/mdadm.h index 91ba624..04b87b8 100644 --- a/mdadm.h +++ b/mdadm.h @@ -153,6 +153,11 @@ struct mdinfo { int cache_size; /* size of raid456 stripe cache*/ int mismatch_cnt; char text_version[50]; + void *update_private; /* for passing metadata-format + * specific update data + * between successive calls to + * update_super() + */ int container_member; /* for assembling external-metatdata arrays * This is to be used internally by metadata diff --git a/super-intel.c b/super-intel.c index eaf5b0b..110c4a8 100644 --- a/super-intel.c +++ b/super-intel.c @@ -1378,8 +1378,6 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info, char *update, char *devname, int verbose, int uuid_set, char *homehost) { - /* FIXME */ - /* For 'assemble' and 'force' we need to return non-zero if any * change was made. For others, the return value is ignored. * Update options are: @@ -1395,26 +1393,55 @@ static int update_super_imsm(struct supertype *st, struct mdinfo *info, * linear only * resync: mark as dirty so a resync will happen. * name: update the name - preserving the homehost + * uuid: Change the uuid of the array to match watch is given * * Following are not relevant for this imsm: * sparc2.2 : update from old dodgey metadata * super-minor: change the preferred_minor number * summaries: update redundant counters. - * uuid: Change the uuid of the array to match watch is given * homehost: update the recorded homehost * _reshape_progress: record new reshape_progress position. */ - int rv = 0; - //struct intel_super *super = st->sb; - //struct imsm_super *mpb = super->mpb; + int rv = 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb; - if (strcmp(update, "grow") == 0) { - } - if (strcmp(update, "resync") == 0) { - /* dev->vol.dirty = 1; */ - } + /* we can only update container info */ + if (!super || super->current_vol >= 0 || !super->anchor) + return 1; + + mpb = super->anchor; + + if (strcmp(update, "uuid") == 0 && uuid_set && !info->update_private) + fprintf(stderr, + Name ": '--uuid' not supported for imsm metadata\n"); + else if (strcmp(update, "uuid") == 0 && uuid_set && info->update_private) { + mpb->orig_family_num = *((__u32 *) info->update_private); + rv = 0; + } else if (strcmp(update, "uuid") == 0) { + __u32 *new_family = malloc(sizeof(*new_family)); + + /* update orig_family_number with the incoming random + * data, report the new effective uuid, and store the + * new orig_family_num for future updates. + */ + if (new_family) { + memcpy(&mpb->orig_family_num, info->uuid, sizeof(__u32)); + uuid_from_super_imsm(st, info->uuid); + *new_family = mpb->orig_family_num; + info->update_private = new_family; + rv = 0; + } + } else if (strcmp(update, "assemble") == 0) + rv = 0; + else + fprintf(stderr, + Name ": '--update=%s' not supported for imsm metadata\n", + update); - /* IMSM has no concept of UUID or homehost */ + /* successful update? recompute checksum */ + if (rv == 0) + mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb)); return rv; } -- cgit v1.2.1 From d2b9eb5993b6c36bf1d66980811bda1b6eefb19f Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:53 -0700 Subject: imsm: regression test for prodigal array member scenario Provide a test to sanity check assembly and reassembly in the presence of conflicting family number information. Signed-off-by: Dan Williams --- tests/09imsm-assemble | 46 ++++++++++++++++++++++++++++++++++++++++++++++ tests/env-09imsm-assemble | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) create mode 100644 tests/09imsm-assemble create mode 100644 tests/env-09imsm-assemble diff --git a/tests/09imsm-assemble b/tests/09imsm-assemble new file mode 100644 index 0000000..7389b0e --- /dev/null +++ b/tests/09imsm-assemble @@ -0,0 +1,46 @@ +# validate the prodigal member disk scenario i.e. a former container +# member is returned after having been rebuilt on another system +num_disks=4 +size=$((10*1024)) +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +mdadm -CR $member $dev0 $dev2 -n 2 -l 1 -z $size +mdadm --wait $member +mdadm -Ss + +# make dev0 and dev1 a new rebuild family +mdadm -A $container $dev0 $dev1 +mdadm -I $container +mdadm --wait ${member}_0 +mdadm -Ss + +# make dev2 and dev3 a new rebuild family +mdadm -A $container $dev2 $dev3 +mdadm -I $container +mdadm --wait ${member}_0 +mdadm -Ss + +# reassemble and make sure one of the families falls out +mdadm -A $container $dev0 $dev1 $dev2 $dev3 +mdadm -I $container +testdev ${member}_0 1 $size 1 +if mdadm --remove $container $dev0 ; then + # the dev[23] family won + imsm_check_removal $container $dev1 + imsm_check_hold $container $dev2 + imsm_check_hold $container $dev3 +else + # the dev[01] family won + imsm_check_hold $container $dev1 + imsm_check_removal $container $dev2 + imsm_check_removal $container $dev3 +fi +mdadm -Ss + +# reassemble with a new id for the dev[23] family +mdadm -A $container $dev0 $dev1 +mdadm -I $container +mdadm -A ${container}2 $dev2 $dev3 --update=uuid +mdadm -I ${container}2 + +testdev ${member}_0 1 $size 1 +testdev ${member}_1 1 $size 1 diff --git a/tests/env-09imsm-assemble b/tests/env-09imsm-assemble new file mode 100644 index 0000000..b12954b --- /dev/null +++ b/tests/env-09imsm-assemble @@ -0,0 +1,32 @@ +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +setup_env() { + export IMSM_DEVNAME_AS_SERIAL=1 + export IMSM_TEST_OROM=1 + container=/dev/md/container + member=/dev/md/vol0 +} + +reset_env() { + unset IMSM_DEVNAME_AS_SERIAL + unset IMSM_TEST_OROM + unset imsm_check + unset container + unset member +} -- cgit v1.2.1 From aae5a11207cf6da1682e6a76e116a19e21473f03 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:57 -0700 Subject: Detail: export MD_UUID from mapfile The load_super() from an mdadm --detail call may race against an mdmon update. When this happens the load_super sees an inconsistent metadata block and returns an error. The fallback path to use the map file contents lacks uuid reporting, so provide __fname_from_uuid for generically printing a uuid. Reported-by: Hans de Goede Signed-off-by: Dan Williams --- Detail.c | 5 +++++ mdadm.h | 1 + util.c | 12 ++++++++---- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Detail.c b/Detail.c index 001012a..1598a42 100644 --- a/Detail.c +++ b/Detail.c @@ -194,7 +194,12 @@ int Detail(char *dev, int brief, int export, int test, char *homehost) st->ss->export_detail_super(st); } else { struct map_ent *mp, *map = NULL; + char nbuf[64]; mp = map_by_devnum(&map, fd2devnum(fd)); + if (mp) { + __fname_from_uuid(mp->uuid, 0, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + } if (mp && mp->path && strncmp(mp->path, "/dev/md/", 8) == 0) printf("MD_DEVNAME=%s\n", mp->path+8); diff --git a/mdadm.h b/mdadm.h index 04b87b8..8212a2c 100644 --- a/mdadm.h +++ b/mdadm.h @@ -810,6 +810,7 @@ extern void uuid_from_super(int uuid[4], mdp_super_t *super); extern const int uuid_match_any[4]; extern int same_uuid(int a[4], int b[4], int swapuuid); extern void copy_uuid(void *a, int b[4], int swapuuid); +extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep); extern char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep); extern unsigned long calc_csum(void *super, int bytes); diff --git a/util.c b/util.c index 4ccb1bb..98aedd0 100644 --- a/util.c +++ b/util.c @@ -269,17 +269,15 @@ void copy_uuid(void *a, int b[4], int swapuuid) memcpy(a, b, 16); } -char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +char *__fname_from_uuid(int id[4], int swap, char *buf, char sep) { int i, j; - int id; char uuid[16]; char *c = buf; strcpy(c, "UUID-"); c += strlen(c); - copy_uuid(uuid, info->uuid, st->ss->swapuuid); + copy_uuid(uuid, id, swap); for (i = 0; i < 4; i++) { - id = uuid[i]; if (i) *c++ = sep; for (j = 3; j >= 0; j--) { @@ -288,6 +286,12 @@ char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char } } return buf; + +} + +char *fname_from_uuid(struct supertype *st, struct mdinfo *info, char *buf, char sep) +{ + return __fname_from_uuid(info->uuid, st->ss->swapuuid, buf, sep); } #ifndef MDASSEMBLE -- cgit v1.2.1 From 96a8270d46faab599b41f1cf78b4331b44c5a6be Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:41:57 -0700 Subject: mdmon: avoid writes in the startup path for mdmon on root arrays When killing a previous monitor be careful not to cause writes to the filesystem until the reads necessary to get the monitor operational have completed. The code is already prepared for errors creating the pid and socket files, so simply defer creation of these files until after the first call to manage(). Cc: Hans de Goede Signed-off-by: Dan Williams --- managemon.c | 6 +++++ mdmon.c | 78 +++++++++++++++++++++++++------------------------------------ 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/managemon.c b/managemon.c index f9d545d..5958e18 100644 --- a/managemon.c +++ b/managemon.c @@ -680,6 +680,12 @@ void do_manager(struct supertype *container) read_sock(container); if (container->sock < 0 || socket_hup_requested) { + /* If this fails, we hope it already exists + * pid file lives in /var/run/mdadm/mdXX.pid + */ + mkdir("/var", 0600); + mkdir("/var/run", 0600); + mkdir("/var/run/mdadm", 0600); close(container->sock); container->sock = make_control_sock(container->devname); make_pidfile(container->devname, 0); diff --git a/mdmon.c b/mdmon.c index 31994d8..5f87e78 100644 --- a/mdmon.c +++ b/mdmon.c @@ -113,6 +113,14 @@ static struct superswitch *find_metadata_methods(char *vers) return NULL; } +static int test_pidfile(char *devname) +{ + char path[100]; + struct stat st; + + sprintf(path, "/var/run/mdadm/%s.pid", devname); + return stat(path, &st); +} int make_pidfile(char *devname, int o_excl) { @@ -149,26 +157,29 @@ int is_container_member(struct mdstat_ent *mdstat, char *container) return 1; } -void remove_pidfile(char *devname); -static void try_kill_monitor(char *devname) +pid_t devname2mdmon(char *devname) { char buf[100]; + pid_t pid = -1; int fd; - pid_t pid; - struct mdstat_ent *mdstat; sprintf(buf, "/var/run/mdadm/%s.pid", devname); - fd = open(buf, O_RDONLY); + fd = open(buf, O_RDONLY|O_NOATIME); if (fd < 0) - return; - - if (read(fd, buf, sizeof(buf)) < 0) { - close(fd); - return; - } + return -1; + if (read(fd, buf, sizeof(buf)) > 0) + sscanf(buf, "%d\n", &pid); close(fd); - pid = strtoul(buf, NULL, 10); + + return pid; +} + +static void try_kill_monitor(pid_t pid, char *devname) +{ + char buf[100]; + int fd; + struct mdstat_ent *mdstat; /* first rule of survival... don't off yourself */ if (pid == getpid()) @@ -197,7 +208,6 @@ static void try_kill_monitor(char *devname) WaitClean(buf, 0); } free_mdstat(mdstat); - remove_pidfile(devname); } void remove_pidfile(char *devname) @@ -355,6 +365,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) int pfd[2]; int status; int ignore; + pid_t victim = -1; dprintf("starting mdmon for %s in %s\n", devname, switchroot ? : "/"); @@ -400,6 +411,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) container->devname = devname; container->arrays = NULL; container->subarray[0] = 0; + container->sock = -1; if (!container->devname) { fprintf(stderr, "mdmon: failed to allocate container name string\n"); @@ -464,12 +476,9 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) if (switchroot) { /* we assume we assume that /sys /proc /dev are available in - * the new root (see nash:setuproot) - * - * kill any monitors in the current namespace and change - * to the new one + * the new root */ - try_kill_monitor(container->devname); + victim = devname2mdmon(container->devname); if (chroot(switchroot) != 0) { fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n", switchroot, strerror(errno)); @@ -477,40 +486,15 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) } } - /* If this fails, we hope it already exists - * pid file lives in /var/run/mdadm/mdXX.pid - */ - mkdir("/var", 0600); - mkdir("/var/run", 0600); - mkdir("/var/run/mdadm", 0600); ignore = chdir("/"); - if (make_pidfile(container->devname, O_EXCL) < 0) { + if (victim < 0 && test_pidfile(container->devname) == 0) { if (ping_monitor(container->devname) == 0) { fprintf(stderr, "mdmon: %s already managed\n", container->devname); exit(3); - } else { - int err; - - /* cleanup the old monitor, this one is taking over */ - try_kill_monitor(container->devname); - err = make_pidfile(container->devname, 0); - if (err < 0) { - fprintf(stderr, "mdmon: %s Cannot create pidfile\n", - container->devname); - if (err == -EROFS) { - /* FIXME implement a mechanism to - * prevent duplicate monitor instances - */ - fprintf(stderr, - "mdmon: continuing on read-only file system\n"); - } else - exit(3); - } - } + } else if (victim < 0) + victim = devname2mdmon(container->devname); } - container->sock = make_control_sock(container->devname); - if (container->ss->load_super(container, mdfd, devname)) { fprintf(stderr, "mdmon: Cannot load metadata for %s\n", devname); @@ -544,6 +528,8 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) exit(2); } + if (victim > -1) + try_kill_monitor(victim, container->devname); do_manager(container); exit(0); -- cgit v1.2.1 From b928b5a0384e7181425a282a0586cbbb3c85fbc3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:08:33 -0700 Subject: mdmon: exec(2) when the switchroot argument is not "/" Try to execute mdmon from the target namespace. When used for initramfs handovers we need to drop all references to the initramfs filesystem for that memory to be freed. Cc: Hans de Goede Signed-off-by: Dan Williams --- mdmon.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mdmon.c b/mdmon.c index 5f87e78..d3e8be5 100644 --- a/mdmon.c +++ b/mdmon.c @@ -369,6 +369,29 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) dprintf("starting mdmon for %s in %s\n", devname, switchroot ? : "/"); + + /* try to spawn mdmon instances from the target file system */ + if (switchroot && strcmp(switchroot, "/") != 0) { + char path[1024]; + pid_t pid; + + sprintf(path, "%s/sbin/mdmon", switchroot); + switch (fork()) { + case 0: + execl(path, "mdmon", devname, NULL); + exit(1); + case -1: + return 1; + default: + pid = wait(&status); + if (pid > -1 && WIFEXITED(status) && + WEXITSTATUS(status) == 0) + return 0; + else + return 1; + } + } + mdfd = open_dev(devnum); if (mdfd < 0) { fprintf(stderr, "mdmon: %s: %s\n", devname, -- cgit v1.2.1 From 9f1da8242161ba684f2867f211eb7e9d4baa84bb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 13 Oct 2009 17:37:02 -0700 Subject: mdmon: preserve socket over chroot Connect to the monitor in the old namespace and use that connection for WaitClean requests when stopping the victim mdmon instance. This allows ping_monitor() to work post chroot(). Cc: Hans de Goede Signed-off-by: Dan Williams --- mdadm.c | 4 ++-- mdadm.h | 2 +- mdmon.c | 12 ++++++++---- msg.c | 14 +++++++++++--- msg.h | 1 + sysfs.c | 5 +++-- 6 files changed, 26 insertions(+), 12 deletions(-) diff --git a/mdadm.c b/mdadm.c index bb3e5bb..6f43dc3 100644 --- a/mdadm.c +++ b/mdadm.c @@ -1276,7 +1276,7 @@ int main(int argc, char *argv[]) export, test, homehost); else - rv |= WaitClean(name, v); + rv |= WaitClean(name, -1, v); put_md_name(name); } free_mdstat(ms); @@ -1337,7 +1337,7 @@ int main(int argc, char *argv[]) case 'W': rv |= Wait(dv->devname); continue; case Waitclean: - rv |= WaitClean(dv->devname, verbose-quiet); continue; + rv |= WaitClean(dv->devname, -1, verbose-quiet); continue; } mdfd = open_mddev(dv->devname, 1); if (mdfd>=0) { diff --git a/mdadm.h b/mdadm.h index 8212a2c..ffa5f53 100644 --- a/mdadm.h +++ b/mdadm.h @@ -753,7 +753,7 @@ extern int Monitor(mddev_dev_t devlist, extern int Kill(char *dev, int force, int quiet, int noexcl); extern int Wait(char *dev); -extern int WaitClean(char *dev, int verbose); +extern int WaitClean(char *dev, int sock, int verbose); extern int Incremental(char *devname, int verbose, int runstop, struct supertype *st, char *homehost, int require_homehost, diff --git a/mdmon.c b/mdmon.c index d3e8be5..50c7be6 100644 --- a/mdmon.c +++ b/mdmon.c @@ -175,7 +175,7 @@ pid_t devname2mdmon(char *devname) return pid; } -static void try_kill_monitor(pid_t pid, char *devname) +static void try_kill_monitor(pid_t pid, char *devname, int sock) { char buf[100]; int fd; @@ -205,7 +205,7 @@ static void try_kill_monitor(pid_t pid, char *devname) for ( ; mdstat; mdstat = mdstat->next) if (is_container_member(mdstat, devname)) { sprintf(buf, "/dev/%s", mdstat->dev); - WaitClean(buf, 0); + WaitClean(buf, sock, 0); } free_mdstat(mdstat); } @@ -366,6 +366,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) int status; int ignore; pid_t victim = -1; + int victim_sock = -1; dprintf("starting mdmon for %s in %s\n", devname, switchroot ? : "/"); @@ -502,6 +503,7 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) * the new root */ victim = devname2mdmon(container->devname); + victim_sock = connect_monitor(container->devname); if (chroot(switchroot) != 0) { fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n", switchroot, strerror(errno)); @@ -551,8 +553,10 @@ int mdmon(char *devname, int devnum, int scan, char *switchroot) exit(2); } - if (victim > -1) - try_kill_monitor(victim, container->devname); + if (victim > -1) { + try_kill_monitor(victim, container->devname, victim_sock); + close(victim_sock); + } do_manager(container); exit(0); diff --git a/msg.c b/msg.c index 5a4839f..8d52b94 100644 --- a/msg.c +++ b/msg.c @@ -177,10 +177,8 @@ int connect_monitor(char *devname) return sfd; } -/* give the monitor a chance to update the metadata */ -int ping_monitor(char *devname) +int fping_monitor(int sfd) { - int sfd = connect_monitor(devname); int err = 0; if (sfd < 0) @@ -194,6 +192,16 @@ int ping_monitor(char *devname) if (!err && wait_reply(sfd, 20) != 0) err = -1; + return err; +} + + +/* give the monitor a chance to update the metadata */ +int ping_monitor(char *devname) +{ + int sfd = connect_monitor(devname); + int err = fping_monitor(sfd); + close(sfd); return err; } diff --git a/msg.h b/msg.h index b9bd205..f8e89fd 100644 --- a/msg.h +++ b/msg.h @@ -27,6 +27,7 @@ extern int ack(int fd, int tmo); extern int wait_reply(int fd, int tmo); extern int connect_monitor(char *devname); extern int ping_monitor(char *devname); +extern int fping_monitor(int sock); extern int ping_manager(char *devname); #define MSG_MAX_LEN (4*1024*1024) diff --git a/sysfs.c b/sysfs.c index 81ccb53..d327e3d 100644 --- a/sysfs.c +++ b/sysfs.c @@ -764,7 +764,7 @@ int sysfs_unique_holder(int devnum, long rdev) static char *clean_states[] = { "clear", "inactive", "readonly", "read-auto", "clean", NULL }; -int WaitClean(char *dev, int verbose) +int WaitClean(char *dev, int sock, int verbose) { int fd; struct mdinfo *mdi; @@ -840,7 +840,8 @@ int WaitClean(char *dev, int verbose) } if (rv < 0) rv = 1; - else if (ping_monitor(mdi->text_version) == 0) { + else if (fping_monitor(sock) == 0 || + ping_monitor(mdi->text_version) == 0) { /* we need to ping to close the window between array * state transitioning to clean and the metadata being * marked clean -- cgit v1.2.1