#include #include "nvme.h" int __nvme_submit_sync_cmd(struct nvme_ctrl *ctrl, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, unsigned timeout, int qid) { return ctrl->ops->submit_sync_cmd(ctrl, cmd, result, buffer, bufflen, timeout, qid); } EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); int nvme_submit_sync_cmd(struct nvme_ctrl *ctrl, struct nvme_command *cmd, void *buffer, unsigned bufflen) { return __nvme_submit_sync_cmd(ctrl, cmd, NULL, buffer, bufflen, 0, NVME_QID_ADMIN); } EXPORT_SYMBOL_GPL(nvme_sec_submit); static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) { struct nvme_command c = { }; int error; /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ c.identify.opcode = nvme_admin_identify; c.identify.cns = NVME_ID_CNS_CTRL; *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); if (!*id) return -ENOMEM; error = nvme_submit_sync_cmd(dev, &c, *id, sizeof(struct nvme_id_ctrl)); if (error) kfree(*id); return error; } static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, void *buffer, size_t buflen, u32 *result) { struct nvme_command c; union nvme_result res; int ret; memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_set_features; c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); ret = __nvme_submit_sync_cmd(dev, &c, &res, buffer, buflen, 0, NVME_QID_ADMIN); if (ret >= 0 && result) *result = le32_to_cpu(res.u32); return ret; } int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) { u32 q_count = (*count - 1) | ((*count - 1) << 16); u32 result; int status, nr_io_queues; status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, &result); if (status < 0) return status; /* * Degraded controllers might return an error when setting the queue * count. We still want to be able to bring them online and offer * access to the admin queue, as that might be only way to fix them up. */ if (status > 0) { dev_err(ctrl->dev, "Could not set queue count (%d)\n", status); *count = 0; } else { nr_io_queues = min(result & 0xffff, result >> 16) + 1; *count = min(*count, nr_io_queues); } return 0; } EXPORT_SYMBOL_GPL(nvme_set_queue_count); static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) { uint64_t start = get_time_ns(); unsigned long timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2); u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; int ret; while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { if (csts == ~0) return -ENODEV; if ((csts & NVME_CSTS_RDY) == bit) break; mdelay(100); if (is_timeout(start, timeout)) { dev_err(ctrl->dev, "Device not ready; aborting %s\n", enabled ? 
"initialisation" : "reset"); return -ENODEV; } } return ret; } static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) { struct nvme_command c = { }; c.identify.opcode = nvme_admin_identify; c.identify.cns = NVME_ID_CNS_NS_ACTIVE_LIST; c.identify.nsid = cpu_to_le32(nsid); return nvme_submit_sync_cmd(dev, &c, ns_list, NVME_IDENTIFY_DATA_SIZE); } static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_id_ns *id; struct nvme_command c = { }; int error; /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ c.identify.opcode = nvme_admin_identify; c.identify.nsid = cpu_to_le32(nsid); c.identify.cns = NVME_ID_CNS_NS; id = kmalloc(sizeof(*id), GFP_KERNEL); if (!id) return NULL; error = nvme_submit_sync_cmd(ctrl, &c, id, sizeof(*id)); if (error) { dev_warn(ctrl->dev, "Identify namespace failed\n"); kfree(id); return NULL; } return id; } static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_id_ns *id) { static int instance = 1; struct nvme_ns_head *head; int ret = -ENOMEM; head = kzalloc(sizeof(*head), GFP_KERNEL); if (!head) goto out; head->instance = instance++; head->ns_id = nsid; return head; out: return ERR_PTR(ret); } static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, struct nvme_id_ns *id) { struct nvme_ctrl *ctrl = ns->ctrl; const bool is_shared = id->nmic & (1 << 0); struct nvme_ns_head *head = NULL; if (is_shared) { dev_info(ctrl->dev, "Skipping shared namespace %u\n", nsid); return -ENOTSUPP; } head = nvme_alloc_ns_head(ctrl, nsid, id); if (IS_ERR(head)) return PTR_ERR(head); ns->head = head; return 0; } #define DISK_NAME_LEN 32 static void nvme_update_disk_info(struct block_device *blk, struct nvme_ns *ns, struct nvme_id_ns *id) { blk->blockbits = ns->lba_shift; blk->num_blocks = le64_to_cpup(&id->nsze); ns->readonly = id->nsattr & (1 << 0); } static void __nvme_revalidate_disk(struct block_device *blk, struct nvme_id_ns *id) { struct nvme_ns *ns = to_nvme_ns(blk); /* * If identify namespace failed, use default 512 byte block size so * block layer can use before failing read/write for 0 capacity. 
static void __nvme_revalidate_disk(struct block_device *blk,
				   struct nvme_id_ns *id)
{
	struct nvme_ns *ns = to_nvme_ns(blk);

	/*
	 * If identify namespace failed, use default 512 byte block size so
	 * block layer can use before failing read/write for 0 capacity.
	 */
	ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
	if (ns->lba_shift == 0)
		ns->lba_shift = 9;

	nvme_update_disk_info(blk, ns, id);
}

static void nvme_setup_rw(struct nvme_ns *ns, struct nvme_command *cmnd,
			  int block, int num_block)
{
	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, block));
	cmnd->rw.length = cpu_to_le16(num_block - 1);
	cmnd->rw.control = 0;
	cmnd->rw.dsmgmt = 0;
}

static void nvme_setup_flush(struct nvme_ns *ns, struct nvme_command *cmnd)
{
	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}

static int nvme_submit_sync_rw(struct nvme_ns *ns, struct nvme_command *cmnd,
			       void *buffer, int block, int num_blocks)
{
	/*
	 * ns->ctrl->max_hw_sectors is in units of 512 bytes, so we
	 * need to make sure we adjust it to the discovered lba_shift
	 */
	const u32 max_hw_sectors =
		ns->ctrl->max_hw_sectors >> (ns->lba_shift - 9);
	int ret;

	if (num_blocks > max_hw_sectors) {
		/* Split oversized requests into max_hw_sectors sized chunks */
		while (num_blocks) {
			const int chunk = min_t(int, num_blocks,
						max_hw_sectors);

			ret = nvme_submit_sync_rw(ns, cmnd, buffer, block,
						  chunk);
			if (ret)
				break;

			num_blocks -= chunk;
			buffer += chunk << ns->lba_shift;
			block += chunk;
		}

		return ret;
	}

	nvme_setup_rw(ns, cmnd, block, num_blocks);

	ret = __nvme_submit_sync_cmd(ns->ctrl, cmnd, NULL, buffer,
				     num_blocks << ns->lba_shift, 0,
				     NVME_QID_IO);
	if (ret) {
		dev_err(ns->ctrl->dev,
			"I/O failed: block: %d, num blocks: %d, status code type: %xh, status code %02xh\n",
			block, num_blocks, (ret >> 8) & 0xf, ret & 0xff);
		return -EIO;
	}

	return 0;
}

static int nvme_block_device_read(struct block_device *blk, void *buffer,
				  int block, int num_blocks)
{
	struct nvme_ns *ns = to_nvme_ns(blk);
	struct nvme_command cmnd = { };

	cmnd.rw.opcode = nvme_cmd_read;

	return nvme_submit_sync_rw(ns, &cmnd, buffer, block, num_blocks);
}

static int __maybe_unused
nvme_block_device_write(struct block_device *blk, const void *buffer,
			int block, int num_blocks)
{
	struct nvme_ns *ns = to_nvme_ns(blk);
	struct nvme_command cmnd = { };

	if (ns->readonly)
		return -EINVAL;

	cmnd.rw.opcode = nvme_cmd_write;

	return nvme_submit_sync_rw(ns, &cmnd, (void *)buffer, block,
				   num_blocks);
}

static int __maybe_unused nvme_block_device_flush(struct block_device *blk)
{
	struct nvme_ns *ns = to_nvme_ns(blk);
	struct nvme_command cmnd = { };

	nvme_setup_flush(ns, &cmnd);

	return __nvme_submit_sync_cmd(ns->ctrl, &cmnd, NULL, NULL, 0, 0,
				      NVME_QID_IO);
}

static struct block_device_ops nvme_block_device_ops = {
	.read = nvme_block_device_read,
#ifdef CONFIG_BLOCK_WRITE
	.write = nvme_block_device_write,
	.flush = nvme_block_device_flush,
#endif
};

static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
	struct nvme_ns *ns;
	struct nvme_id_ns *id;
	char disk_name[DISK_NAME_LEN];
	int ret, flags;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return;

	ns->ctrl = ctrl;
	ns->lba_shift = 9; /* default to 512 bytes until the disk is validated */

	id = nvme_identify_ns(ctrl, nsid);
	if (!id)
		goto out_free_ns;

	if (id->ncap == 0)
		goto out_free_id;

	if (nvme_init_ns_head(ns, nsid, id))
		goto out_free_id;

	nvme_set_disk_name(disk_name, ns, ctrl, &flags);

	ns->blk.dev = ctrl->dev;
	ns->blk.ops = &nvme_block_device_ops;
	ns->blk.cdev.name = strdup(disk_name);

	__nvme_revalidate_disk(&ns->blk, id);

	kfree(id);

	ret = blockdevice_register(&ns->blk);
	if (ret) {
		dev_err(ctrl->dev, "Cannot register block device (%d)\n", ret);
		goto out_free_ns;
	}

	return;

out_free_id:
	kfree(id);
out_free_ns:
	kfree(ns);
}
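/*
 * Scan namespaces using the Identify active namespace ID list. Each list
 * returns up to 1024 IDs above the previously seen one; scanning stops once
 * a zero entry is found or all @nn namespaces reported by the controller
 * have been visited.
 */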
static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
{
	__le32 *ns_list;
	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
	int ret = 0;

	ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
	if (!ns_list)
		return -ENOMEM;

	for (i = 0; i < num_lists; i++) {
		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
		if (ret)
			goto out;

		for (j = 0; j < min(nn, 1024U); j++) {
			nsid = le32_to_cpu(ns_list[j]);
			if (!nsid)
				goto out;

			nvme_alloc_ns(ctrl, nsid);

			prev = nsid;
		}

		nn -= j;
	}
out:
	kfree(ns_list);
	return ret;
}

static void nvme_scan_ns_sequential(struct nvme_ctrl *ctrl, unsigned nn)
{
	unsigned i;

	for (i = 1; i <= nn; i++)
		nvme_alloc_ns(ctrl, i);
}

static void nvme_scan_work(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	unsigned nn;

	if (nvme_identify_ctrl(ctrl, &id))
		return;

	nn = le32_to_cpu(id->nn);

	if (ctrl->vs >= NVME_VS(1, 1, 0)) {
		if (!nvme_scan_ns_list(ctrl, nn))
			goto out_free_id;
	}

	nvme_scan_ns_sequential(ctrl, nn);

out_free_id:
	kfree(id);
}

void nvme_start_ctrl(struct nvme_ctrl *ctrl)
{
	if (ctrl->queue_count > 1)
		nvme_scan_work(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit. The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config &= ~NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	return nvme_wait_ready(ctrl, cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
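/*
 * Bring the controller to the ready state. CC.MPS encodes the memory page
 * size as 2^(12 + MPS), so the fixed 4K host page size maps to MPS == 0;
 * CC.IOSQES/IOCQES select 64 byte submission and 16 byte completion queue
 * entries; arbitration is left at round-robin with no shutdown notification.
 * Once CC.EN is written, the controller is polled until CSTS.RDY is set.
 */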
int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
{
	/*
	 * Default to a 4K page size, with the intention to update this
	 * path in the future to accommodate architectures with differing
	 * kernel and IO page sizes.
	 */
	unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
	int ret;

	if (page_shift < dev_page_min) {
		dev_err(ctrl->dev,
			"Minimum device page size %u too large for host (%u)\n",
			1 << dev_page_min, 1 << page_shift);
		return -ENODEV;
	}

	ctrl->page_size = 1 << page_shift;

	ctrl->ctrl_config = NVME_CC_CSS_NVM;
	ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
	ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
	ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
	ctrl->ctrl_config |= NVME_CC_ENABLE;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	return nvme_wait_ready(ctrl, cap, true);
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);

int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
{
	uint64_t start = get_time_ns();
	unsigned long timeout = SHUTDOWN_TIMEOUT;
	u32 csts;
	int ret;

	ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
	ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;

	ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
	if (ret)
		return ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
			break;

		mdelay(100);

		if (is_timeout(start, timeout)) {
			dev_err(ctrl->dev,
				"Device shutdown incomplete; abort shutdown\n");
			return -ENODEV;
		}
	}

	return ret;
}
EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);

#define NVME_ID_MAX_LEN 41

static void nvme_print(struct nvme_ctrl *ctrl, const char *prefix,
		       const char *_string, size_t _length)
{
	char string[NVME_ID_MAX_LEN];
	const size_t length = min(_length, sizeof(string) - 1);

	memcpy(string, _string, length);
	string[length] = '\0';

	dev_info(ctrl->dev, "%s: %s\n", prefix, string);
}

static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	nvme_print(ctrl, "serial", id->sn, sizeof(id->sn));
	nvme_print(ctrl, "model", id->mn, sizeof(id->mn));
	nvme_print(ctrl, "firmware", id->fr, sizeof(id->fr));

	return 0;
}

/*
 * Initialize the cached copies of the Identify data and various controller
 * registers in our nvme_ctrl structure. This should be called as soon as
 * the admin queue is fully up and running.
 */
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
	struct nvme_id_ctrl *id;
	u64 cap;
	int ret, page_shift;
	u32 max_hw_sectors;

	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
	if (ret) {
		dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
		return ret;
	}

	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
	if (ret) {
		dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
		return ret;
	}

	page_shift = NVME_CAP_MPSMIN(cap) + 12;

	ret = nvme_identify_ctrl(ctrl, &id);
	if (ret) {
		dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
		return -EIO;
	}

	ret = nvme_init_subsystem(ctrl, id);
	if (ret)
		return ret;

	if (id->mdts)
		max_hw_sectors = 1 << (id->mdts + page_shift - 9);
	else
		max_hw_sectors = UINT_MAX;
	ctrl->max_hw_sectors = min_not_zero(ctrl->max_hw_sectors,
					    max_hw_sectors);

	kfree(id);

	return 0;
}
EXPORT_SYMBOL_GPL(nvme_init_identify);

/*
 * Initialize an NVMe controller structure. This needs to be called during
 * earliest initialization so that we have the initialized structure around
 * during probing.
 */
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device_d *dev,
		   const struct nvme_ctrl_ops *ops)
{
	static int instance = 0;

	ctrl->dev = dev;
	ctrl->ops = ops;
	ctrl->instance = instance++;

	return 0;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
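/*
 * A transport driver is expected to glue these helpers together at probe
 * time. The exact flow is transport specific; a rough sketch (illustrative
 * only, the ops table name is a placeholder) is:
 *
 *	nvme_init_ctrl(ctrl, dev, &transport_ctrl_ops);
 *	ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
 *	nvme_disable_ctrl(ctrl, cap);
 *	... set up the admin queue ...
 *	nvme_enable_ctrl(ctrl, cap);
 *	nvme_init_identify(ctrl);
 *	nvme_set_queue_count(ctrl, &nr_io_queues);
 *	... set up the I/O queue(s), bump ctrl->queue_count ...
 *	nvme_start_ctrl(ctrl);
 */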