summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-05-01 10:39:57 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2017-05-01 10:39:57 -0700
commit694752922b12bd318aa80191bd9d8c3dcfb39055 (patch)
tree5afe83fd99100bea546dd5a1c1f778c58f41e5c0
parenta351e9b9fc24e982ec2f0e76379a49826036da12 (diff)
parent9438b3e080beccf6022138ea62192d55cc7dc4ed (diff)
downloadlinux-0-day-694752922b12bd318aa80191bd9d8c3dcfb39055.tar.gz
linux-0-day-694752922b12bd318aa80191bd9d8c3dcfb39055.tar.xz
Merge branch 'for-4.12/block' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe: - Add BFQ IO scheduler under the new blk-mq scheduling framework. BFQ was initially a fork of CFQ, but subsequently changed to implement fairness based on B-WF2Q+, a modified variant of WF2Q. BFQ is meant to be used on desktop type single drives, providing good fairness. From Paolo. - Add Kyber IO scheduler. This is a full multiqueue aware scheduler, using a scalable token based algorithm that throttles IO based on live completion IO stats, similarly to blk-wbt. From Omar. - A series from Jan, moving users to separately allocated backing devices. This continues the work of separating backing device life times, solving various problems with hot removal. - A series of updates for lightnvm, mostly from Javier. Includes a 'pblk' target that exposes an open channel SSD as a physical block device. - A series of fixes and improvements for nbd from Josef. - A series from Omar, removing queue sharing between devices on mostly legacy drivers. This helps us clean up other bits, if we know that a queue only has a single device backing. This has been overdue for more than a decade. - Fixes for the blk-stats, and improvements to unify the stats and user windows. This both improves blk-wbt, and enables other users to register a need to receive IO stats for a device. From Omar. - blk-throttle improvements from Shaohua. This provides a scalable framework for implementing scalable prioritization - particularly for blk-mq, but applicable to any type of block device. The interface is marked experimental for now. - Bucketized IO stats for IO polling from Stephen Bates. This improves efficiency of polled workloads in the presence of mixed block size IO. - A few fixes for opal, from Scott. - A few pulls for NVMe, including a lot of fixes for NVMe-over-fabrics. From a variety of folks, mostly Sagi and James Smart. - A series from Bart, improving our exposed info and capabilities from the blk-mq debugfs support. 
- A series from Christoph, cleaning up how we handle WRITE_ZEROES. - A series from Christoph, cleaning up the block layer handling of how we track errors in a request. On top of being a nice cleanup, it also shrinks the size of struct request a bit. - Removal of mg_disk and hd (sorry Linus) by Christoph. The former was never used by platforms, and the latter has outlived its usefulness. - Various little bug fixes and cleanups from a wide variety of folks. * 'for-4.12/block' of git://git.kernel.dk/linux-block: (329 commits) block: hide badblocks attribute by default blk-mq: unify hctx delay_work and run_work block: add kblock_mod_delayed_work_on() blk-mq: unify hctx delayed_run_work and run_work nbd: fix use after free on module unload MAINTAINERS: bfq: Add Paolo as maintainer for the BFQ I/O scheduler blk-mq-sched: alloate reserved tags out of normal pool mtip32xx: use runtime tag to initialize command header scsi: Implement blk_mq_ops.show_rq() blk-mq: Add blk_mq_ops.show_rq() blk-mq: Show operation, cmd_flags and rq_flags names blk-mq: Make blk_flags_show() callers append a newline character blk-mq: Move the "state" debugfs attribute one level down blk-mq: Unregister debugfs attributes earlier blk-mq: Only unregister hctxs for which registration succeeded blk-mq-debugfs: Rename functions for registering and unregistering the mq directory blk-mq: Let blk_mq_debugfs_register() look up the queue name blk-mq: Register <dev>/queue/mq after having registered <dev>/queue ide-pm: always pass 0 error to ide_complete_rq in ide_do_devset ide-pm: always pass 0 error to __blk_end_request_all ..
-rw-r--r--Documentation/ABI/testing/sysfs-block10
-rw-r--r--Documentation/block/00-INDEX2
-rw-r--r--Documentation/block/bfq-iosched.txt531
-rw-r--r--Documentation/block/kyber-iosched.txt14
-rw-r--r--Documentation/block/queue-sysfs.txt11
-rw-r--r--Documentation/blockdev/mflash.txt84
-rw-r--r--Documentation/lightnvm/pblk.txt21
-rw-r--r--MAINTAINERS8
-rw-r--r--block/Kconfig12
-rw-r--r--block/Kconfig.iosched30
-rw-r--r--block/Makefile3
-rw-r--r--block/bfq-cgroup.c1139
-rw-r--r--block/bfq-iosched.c5047
-rw-r--r--block/bfq-iosched.h941
-rw-r--r--block/bfq-wf2q.c1616
-rw-r--r--block/bio.c19
-rw-r--r--block/blk-cgroup.c123
-rw-r--r--block/blk-core.c143
-rw-r--r--block/blk-exec.c11
-rw-r--r--block/blk-flush.c5
-rw-r--r--block/blk-integrity.c24
-rw-r--r--block/blk-lib.c78
-rw-r--r--block/blk-merge.c17
-rw-r--r--block/blk-mq-debugfs.c331
-rw-r--r--block/blk-mq-pci.c2
-rw-r--r--block/blk-mq-sched.c103
-rw-r--r--block/blk-mq-sched.h18
-rw-r--r--block/blk-mq-sysfs.c61
-rw-r--r--block/blk-mq-tag.c5
-rw-r--r--block/blk-mq.c565
-rw-r--r--block/blk-mq.h16
-rw-r--r--block/blk-settings.c3
-rw-r--r--block/blk-stat.c323
-rw-r--r--block/blk-stat.h204
-rw-r--r--block/blk-sysfs.c82
-rw-r--r--block/blk-throttle.c985
-rw-r--r--block/blk-timeout.c1
-rw-r--r--block/blk-wbt.c95
-rw-r--r--block/blk-wbt.h16
-rw-r--r--block/blk.h15
-rw-r--r--block/bsg-lib.c8
-rw-r--r--block/bsg.c12
-rw-r--r--block/cfq-iosched.c17
-rw-r--r--block/compat_ioctl.c2
-rw-r--r--block/elevator.c3
-rw-r--r--block/genhd.c13
-rw-r--r--block/ioctl.c4
-rw-r--r--block/ioprio.c12
-rw-r--r--block/kyber-iosched.c719
-rw-r--r--block/partition-generic.c1
-rw-r--r--block/scsi_ioctl.c23
-rw-r--r--block/sed-opal.c153
-rw-r--r--block/t10-pi.c8
-rw-r--r--drivers/block/Kconfig47
-rw-r--r--drivers/block/Makefile3
-rw-r--r--drivers/block/ataflop.c12
-rw-r--r--drivers/block/brd.c54
-rw-r--r--drivers/block/cciss.c42
-rw-r--r--drivers/block/drbd/drbd_debugfs.c3
-rw-r--r--drivers/block/drbd/drbd_int.h6
-rw-r--r--drivers/block/drbd/drbd_main.c5
-rw-r--r--drivers/block/drbd/drbd_nl.c9
-rw-r--r--drivers/block/drbd/drbd_receiver.c105
-rw-r--r--drivers/block/drbd/drbd_req.c13
-rw-r--r--drivers/block/drbd/drbd_worker.c4
-rw-r--r--drivers/block/floppy.c10
-rw-r--r--drivers/block/hd.c803
-rw-r--r--drivers/block/loop.c38
-rw-r--r--drivers/block/loop.h1
-rw-r--r--drivers/block/mg_disk.c1112
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c58
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h1
-rw-r--r--drivers/block/nbd.c1417
-rw-r--r--drivers/block/null_blk.c22
-rw-r--r--drivers/block/osdblk.c693
-rw-r--r--drivers/block/paride/pcd.c57
-rw-r--r--drivers/block/paride/pd.c57
-rw-r--r--drivers/block/paride/pf.c57
-rw-r--r--drivers/block/pktcdvd.c2
-rw-r--r--drivers/block/rbd.c3
-rw-r--r--drivers/block/rsxx/dev.c1
-rw-r--r--drivers/block/swim.c55
-rw-r--r--drivers/block/swim3.c4
-rw-r--r--drivers/block/virtio_blk.c21
-rw-r--r--drivers/block/xen-blkfront.c41
-rw-r--r--drivers/block/zram/zram_drv.c13
-rw-r--r--drivers/cdrom/cdrom.c3
-rw-r--r--drivers/ide/ide-atapi.c11
-rw-r--r--drivers/ide/ide-cd.c21
-rw-r--r--drivers/ide/ide-cd_ioctl.c3
-rw-r--r--drivers/ide/ide-devsets.c8
-rw-r--r--drivers/ide/ide-disk.c3
-rw-r--r--drivers/ide/ide-dma.c2
-rw-r--r--drivers/ide/ide-eh.c36
-rw-r--r--drivers/ide/ide-floppy.c10
-rw-r--r--drivers/ide/ide-io.c10
-rw-r--r--drivers/ide/ide-ioctls.c7
-rw-r--r--drivers/ide/ide-park.c3
-rw-r--r--drivers/ide/ide-pm.c9
-rw-r--r--drivers/ide/ide-tape.c4
-rw-r--r--drivers/ide/ide-taskfile.c8
-rw-r--r--drivers/lightnvm/Kconfig9
-rw-r--r--drivers/lightnvm/Makefile5
-rw-r--r--drivers/lightnvm/core.c147
-rw-r--r--drivers/lightnvm/pblk-cache.c114
-rw-r--r--drivers/lightnvm/pblk-core.c1667
-rw-r--r--drivers/lightnvm/pblk-gc.c555
-rw-r--r--drivers/lightnvm/pblk-init.c962
-rw-r--r--drivers/lightnvm/pblk-map.c136
-rw-r--r--drivers/lightnvm/pblk-rb.c852
-rw-r--r--drivers/lightnvm/pblk-read.c529
-rw-r--r--drivers/lightnvm/pblk-recovery.c998
-rw-r--r--drivers/lightnvm/pblk-rl.c184
-rw-r--r--drivers/lightnvm/pblk-sysfs.c507
-rw-r--r--drivers/lightnvm/pblk-write.c414
-rw-r--r--drivers/lightnvm/pblk.h1121
-rw-r--r--drivers/lightnvm/rrpc.c25
-rw-r--r--drivers/md/dm-cache-target.c1
-rw-r--r--drivers/md/dm-core.h1
-rw-r--r--drivers/md/dm-crypt.c1
-rw-r--r--drivers/md/dm-io.c18
-rw-r--r--drivers/md/dm-kcopyd.c6
-rw-r--r--drivers/md/dm-linear.c1
-rw-r--r--drivers/md/dm-mpath.c3
-rw-r--r--drivers/md/dm-raid.c6
-rw-r--r--drivers/md/dm-raid1.c1
-rw-r--r--drivers/md/dm-rq.c15
-rw-r--r--drivers/md/dm-stripe.c2
-rw-r--r--drivers/md/dm-table.c49
-rw-r--r--drivers/md/dm-thin.c2
-rw-r--r--drivers/md/dm.c32
-rw-r--r--drivers/md/linear.c1
-rw-r--r--drivers/md/md.h7
-rw-r--r--drivers/md/multipath.c1
-rw-r--r--drivers/md/raid0.c2
-rw-r--r--drivers/md/raid1.c4
-rw-r--r--drivers/md/raid10.c1
-rw-r--r--drivers/md/raid5.c53
-rw-r--r--drivers/mmc/core/queue.c2
-rw-r--r--drivers/mtd/mtdcore.c23
-rw-r--r--drivers/mtd/mtdsuper.c6
-rw-r--r--drivers/mtd/ubi/block.c2
-rw-r--r--drivers/nvme/host/core.c94
-rw-r--r--drivers/nvme/host/fabrics.c28
-rw-r--r--drivers/nvme/host/fabrics.h10
-rw-r--r--drivers/nvme/host/fc.c223
-rw-r--r--drivers/nvme/host/lightnvm.c49
-rw-r--r--drivers/nvme/host/nvme.h48
-rw-r--r--drivers/nvme/host/pci.c223
-rw-r--r--drivers/nvme/host/rdma.c154
-rw-r--r--drivers/nvme/target/admin-cmd.c31
-rw-r--r--drivers/nvme/target/core.c21
-rw-r--r--drivers/nvme/target/discovery.c19
-rw-r--r--drivers/nvme/target/fabrics-cmd.c36
-rw-r--r--drivers/nvme/target/fc.c274
-rw-r--r--drivers/nvme/target/fcloop.c197
-rw-r--r--drivers/nvme/target/io-cmd.c24
-rw-r--r--drivers/nvme/target/loop.c92
-rw-r--r--drivers/nvme/target/nvmet.h11
-rw-r--r--drivers/nvme/target/rdma.c47
-rw-r--r--drivers/sbus/char/jsflash.c50
-rw-r--r--drivers/scsi/Makefile1
-rw-r--r--drivers/scsi/lpfc/lpfc_nvmet.c126
-rw-r--r--drivers/scsi/lpfc/lpfc_nvmet.h7
-rw-r--r--drivers/scsi/osd/osd_initiator.c9
-rw-r--r--drivers/scsi/osst.c4
-rw-r--r--drivers/scsi/qla2xxx/qla_bsg.c6
-rw-r--r--drivers/scsi/scsi_debugfs.c13
-rw-r--r--drivers/scsi/scsi_debugfs.h4
-rw-r--r--drivers/scsi/scsi_error.c2
-rw-r--r--drivers/scsi/scsi_lib.c25
-rw-r--r--drivers/scsi/scsi_transport_sas.c4
-rw-r--r--drivers/scsi/sd.c259
-rw-r--r--drivers/scsi/sd.h8
-rw-r--r--drivers/scsi/sd_zbc.c1
-rw-r--r--drivers/scsi/sg.c4
-rw-r--r--drivers/scsi/st.c8
-rw-r--r--drivers/staging/lustre/lustre/include/lustre_disk.h4
-rw-r--r--drivers/staging/lustre/lustre/llite/llite_lib.c24
-rw-r--r--drivers/target/iscsi/iscsi_target_configfs.c46
-rw-r--r--drivers/target/target_core_device.c2
-rw-r--r--drivers/target/target_core_pscsi.c4
-rw-r--r--fs/9p/v9fs.c10
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/vfs_super.c15
-rw-r--r--fs/afs/internal.h1
-rw-r--r--fs/afs/super.c5
-rw-r--r--fs/afs/volume.c8
-rw-r--r--fs/block_dev.c28
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/disk-io.c36
-rw-r--r--fs/btrfs/super.c7
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/super.c35
-rw-r--r--fs/ceph/super.h2
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifsfs.c7
-rw-r--r--fs/cifs/connect.c10
-rw-r--r--fs/coda/inode.c11
-rw-r--r--fs/dax.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h1
-rw-r--r--fs/ecryptfs/main.c4
-rw-r--r--fs/exofs/exofs.h1
-rw-r--r--fs/exofs/super.c17
-rw-r--r--fs/fuse/dev.c13
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/fuse/inode.c42
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/ncpfs/inode.c8
-rw-r--r--fs/ncpfs/ncp_fs_sb.h1
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/internal.h6
-rw-r--r--fs/nfs/super.c33
-rw-r--r--fs/nfs/write.c13
-rw-r--r--fs/nfsd/blocklayout.c7
-rw-r--r--fs/nilfs2/super.c2
-rw-r--r--fs/super.c53
-rw-r--r--fs/ubifs/super.c25
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c2
-rw-r--r--include/linux/backing-dev-defs.h8
-rw-r--r--include/linux/backing-dev.h16
-rw-r--r--include/linux/bio.h2
-rw-r--r--include/linux/blk-mq.h21
-rw-r--r--include/linux/blk_types.h47
-rw-r--r--include/linux/blkdev.h76
-rw-r--r--include/linux/coda_psdev.h1
-rw-r--r--include/linux/device-mapper.h11
-rw-r--r--include/linux/elevator.h4
-rw-r--r--include/linux/fs.h3
-rw-r--r--include/linux/genhd.h12
-rw-r--r--include/linux/ide.h2
-rw-r--r--include/linux/inet.h6
-rw-r--r--include/linux/kobject.h2
-rw-r--r--include/linux/lightnvm.h13
-rw-r--r--include/linux/mg_disk.h45
-rw-r--r--include/linux/mtd/mtd.h5
-rw-r--r--include/linux/nfs_fs_sb.h1
-rw-r--r--include/linux/nvme-fc-driver.h104
-rw-r--r--include/linux/nvme-fc.h68
-rw-r--r--include/linux/nvme.h13
-rw-r--r--include/linux/sbitmap.h55
-rw-r--r--include/linux/t10-pi.h8
-rw-r--r--include/linux/writeback.h1
-rw-r--r--include/scsi/scsi_request.h2
-rw-r--r--include/trace/events/block.h61
-rw-r--r--include/uapi/linux/lightnvm.h4
-rw-r--r--include/uapi/linux/nbd-netlink.h98
-rw-r--r--include/uapi/linux/nbd.h6
-rw-r--r--kernel/trace/blktrace.c35
-rw-r--r--lib/kobject.c5
-rw-r--r--lib/sbitmap.c75
-rw-r--r--mm/backing-dev.c186
-rw-r--r--net/core/utils.c103
255 files changed, 24643 insertions, 6152 deletions
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 2da04ce6aeef4..dea212db9df35 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -213,14 +213,8 @@ What: /sys/block/<disk>/queue/discard_zeroes_data
Date: May 2011
Contact: Martin K. Petersen <martin.petersen@oracle.com>
Description:
- Devices that support discard functionality may return
- stale or random data when a previously discarded block
- is read back. This can cause problems if the filesystem
- expects discarded blocks to be explicitly cleared. If a
- device reports that it deterministically returns zeroes
- when a discarded area is read the discard_zeroes_data
- parameter will be set to one. Otherwise it will be 0 and
- the result of reading a discarded area is undefined.
+ Will always return 0. Don't rely on any specific behavior
+ for discards, and don't read this file.
What: /sys/block/<disk>/queue/write_same_max_bytes
Date: January 2012
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e55103ace382a..8d55b4bbb5e2e 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,5 +1,7 @@
00-INDEX
- This file
+bfq-iosched.txt
+ - BFQ IO scheduler and its tunables
biodoc.txt
- Notes on the Generic Block Layer Rewrite in Linux 2.5
biovecs.txt
diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
new file mode 100644
index 0000000000000..1b87df6cd4761
--- /dev/null
+++ b/Documentation/block/bfq-iosched.txt
@@ -0,0 +1,531 @@
+BFQ (Budget Fair Queueing)
+==========================
+
+BFQ is a proportional-share I/O scheduler, with some extra
+low-latency capabilities. In addition to cgroups support (blkio or io
+controllers), BFQ's main features are:
+- BFQ guarantees a high system and application responsiveness, and a
+ low latency for time-sensitive applications, such as audio or video
+ players;
+- BFQ distributes bandwidth, and not just time, among processes or
+ groups (switching back to time distribution when needed to keep
+ throughput high).
+
+On average CPUs, the current version of BFQ can handle devices
+performing at most ~30 KIOPS; at most ~50 KIOPS on faster CPUs. As a
+reference, 30-50 KIOPS correspond to very high bandwidths with
+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
+to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on
+multi-queue devices.
+
+The table of contents follows. Impatient readers can just jump to Section 3.
+
+CONTENTS
+
+1. When may BFQ be useful?
+ 1-1 Personal systems
+ 1-2 Server systems
+2. How does BFQ work?
+3. What are BFQ's tunables?
+4. BFQ group scheduling
+ 4-1 Service guarantees provided
+ 4-2 Interface
+
+1. When may BFQ be useful?
+==========================
+
+BFQ provides the following benefits on personal and server systems.
+
+1-1 Personal systems
+--------------------
+
+Low latency for interactive applications
+
+Regardless of the actual background workload, BFQ guarantees that, for
+interactive tasks, the storage device is virtually as responsive as if
+it was idle. For example, even if one or more of the following
+background workloads are being executed:
+- one or more large files are being read, written or copied,
+- a tree of source files is being compiled,
+- one or more virtual machines are performing I/O,
+- a software update is in progress,
+- indexing daemons are scanning filesystems and updating their
+ databases,
+starting an application or loading a file from within an application
+takes about the same time as if the storage device was idle. As a
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
+applications experience high latencies, or even become unresponsive
+until the background workload terminates (also on SSDs).
+
+Low latency for soft real-time applications
+
+Also soft real-time applications, such as audio and video
+players/streamers, enjoy a low latency and a low drop rate, regardless
+of the background I/O workload. As a consequence, these applications
+suffer almost no glitches due to the background workload.
+
+Higher speed for code-development tasks
+
+If some additional workload happens to be executed in parallel, then
+BFQ executes the I/O-related components of typical code-development
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
+NOOP or DEADLINE.
+
+High throughput
+
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
+up to 150% higher throughput than DEADLINE and NOOP, with all the
+sequential workloads considered in our tests. With random workloads,
+and with all the workloads on flash-based devices, BFQ achieves,
+instead, about the same throughput as the other schedulers.
+
+Strong fairness, bandwidth and delay guarantees
+
+BFQ distributes the device throughput, and not just the device time,
+among I/O-bound applications in proportion to their weights, with any
+workload and regardless of the device parameters. From these bandwidth
+guarantees, it is possible to compute tight per-I/O-request delay
+guarantees by a simple formula. If not configured for strict service
+guarantees, BFQ switches to time-based resource sharing (only) for
+applications that would otherwise cause a throughput loss.
+
+1-2 Server systems
+------------------
+
+Most benefits for server systems follow from the same service
+properties as above. In particular, regardless of whether additional,
+possibly heavy workloads are being served, BFQ guarantees:
+
+. audio and video-streaming with zero or very low jitter and drop
+ rate;
+
+. fast retrieval of WEB pages and embedded objects;
+
+. real-time recording of data in live-dumping applications (e.g.,
+ packet logging);
+
+. responsiveness in local and remote access to a server.
+
+
+2. How does BFQ work?
+=====================
+
+BFQ is a proportional-share I/O scheduler, whose general structure,
+plus a lot of code, are borrowed from CFQ.
+
+- Each process doing I/O on a device is associated with a weight and a
+ (bfq_)queue.
+
+- BFQ grants exclusive access to the device, for a while, to one queue
+ (process) at a time, and implements this service model by
+ associating every queue with a budget, measured in number of
+ sectors.
+
+ - After a queue is granted access to the device, the budget of the
+ queue is decremented, on each request dispatch, by the size of the
+ request.
+
+ - The in-service queue is expired, i.e., its service is suspended,
+ only if one of the following events occurs: 1) the queue finishes
+ its budget, 2) the queue empties, 3) a "budget timeout" fires.
+
+ - The budget timeout prevents processes doing random I/O from
+ holding the device for too long and dramatically reducing
+ throughput.
+
+ - Actually, as in CFQ, a queue associated with a process issuing
+ sync requests may not be expired immediately when it empties. In
+ contrast, BFQ may idle the device for a short time interval,
+ giving the process the chance to go on being served if it issues
+ a new request in time. Device idling typically boosts the
+ throughput on rotational devices, if processes do synchronous
+ and sequential I/O. In addition, under BFQ, device idling is
+ also instrumental in guaranteeing the desired throughput
+ fraction to processes issuing sync requests (see the description
+ of the slice_idle tunable in this document, or [1, 2], for more
+ details).
+
+ - With respect to idling for service guarantees, if several
+ processes are competing for the device at the same time, but
+ all processes (and groups, after the following commit) have
+ the same weight, then BFQ guarantees the expected throughput
+ distribution without ever idling the device. Throughput is
+ thus as high as possible in this common scenario.
+
+ - If low-latency mode is enabled (default configuration), BFQ
+ executes some special heuristics to detect interactive and soft
+ real-time applications (e.g., video or audio players/streamers),
+ and to reduce their latency. The most important action taken to
+ achieve this goal is to give to the queues associated with these
+ applications more than their fair share of the device
+ throughput. For brevity, we call just "weight-raising" the whole
+ sets of actions taken by BFQ to privilege these queues. In
+ particular, BFQ provides a milder form of weight-raising for
+ interactive applications, and a stronger form for soft real-time
+ applications.
+
+ - BFQ automatically deactivates idling for queues born in a burst of
+ queue creations. In fact, these queues are usually associated with
+ the processes of applications and services that benefit mostly
+ from a high throughput. Examples are systemd during boot, or git
+ grep.
+
+ - Like CFQ, BFQ merges queues performing interleaved I/O, i.e.,
+ performing random I/O that becomes mostly sequential if
+ merged. Differently from CFQ, BFQ achieves this goal with a more
+ reactive mechanism, called Early Queue Merge (EQM). EQM is so
+ responsive in detecting interleaved I/O (cooperating processes),
+ that it enables BFQ to achieve a high throughput, by queue
+ merging, even for queues for which CFQ needs a different
+ mechanism, preemption, to get a high throughput. As such EQM is a
+ unified mechanism to achieve a high throughput with interleaved
+ I/O.
+
+ - Queues are scheduled according to a variant of WF2Q+, named
+ B-WF2Q+, and implemented using an augmented rb-tree to preserve an
+ O(log N) overall complexity. See [2] for more details. B-WF2Q+ is
+ also ready for hierarchical scheduling. However, for a cleaner
+ logical breakdown, the code that enables and completes
+ hierarchical support is provided in the next commit, which focuses
+ exactly on this feature.
+
+ - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
+ perfectly fair, and smooth service. In particular, B-WF2Q+
+ guarantees that each queue receives a fraction of the device
+ throughput proportional to its weight, even if the throughput
+ fluctuates, and regardless of: the device parameters, the current
+ workload and the budgets assigned to the queue.
+
+ - The last, budget-independence, property (although probably
+ counterintuitive in the first place) is definitely beneficial, for
+ the following reasons:
+
+ - First, with any proportional-share scheduler, the maximum
+ deviation with respect to an ideal service is proportional to
+ the maximum budget (slice) assigned to queues. As a consequence,
+ BFQ can keep this deviation tight not only because of the
+ accurate service of B-WF2Q+, but also because BFQ *does not*
+ need to assign a larger budget to a queue to let the queue
+ receive a higher fraction of the device throughput.
+
+ - Second, BFQ is free to choose, for every process (queue), the
+ budget that best fits the needs of the process, or best
+ leverages the I/O pattern of the process. In particular, BFQ
+ updates queue budgets with a simple feedback-loop algorithm that
+ allows a high throughput to be achieved, while still providing
+ tight latency guarantees to time-sensitive applications. When
+ the in-service queue expires, this algorithm computes the next
+ budget of the queue so as to:
+
+ - Let large budgets be eventually assigned to the queues
+ associated with I/O-bound applications performing sequential
+ I/O: in fact, the longer these applications are served once
+ they get access to the device, the higher the throughput is.
+
+ - Let small budgets be eventually assigned to the queues
+ associated with time-sensitive applications (which typically
+ perform sporadic and short I/O), because, the smaller the
+ budget assigned to a queue waiting for service is, the sooner
+ B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
+
+- If several processes are competing for the device at the same time,
+ but all processes and groups have the same weight, then BFQ
+ guarantees the expected throughput distribution without ever idling
+ the device. It uses preemption instead. Throughput is then much
+ higher in this common scenario.
+
+- ioprio classes are served in strict priority order, i.e.,
+ lower-priority queues are not served as long as there are
+ higher-priority queues. Among queues in the same class, the
+ bandwidth is distributed in proportion to the weight of each
+ queue. A very thin extra bandwidth is however guaranteed to
+ the Idle class, to prevent it from starving.
+
+
+3. What are BFQ's tunables?
+===========================
+
+The tunables back_seek_max, back_seek_penalty, fifo_expire_async and
+fifo_expire_sync below are the same as in CFQ. Their description is
+just copied from that for CFQ. Some considerations in the description
+of slice_idle are copied from CFQ too.
+
+per-process ioprio and weight
+-----------------------------
+
+Unless the cgroups interface is used (see "4. BFQ group scheduling"),
+weights can be assigned to processes only indirectly, through I/O
+priorities, and according to the relation:
+weight = (IOPRIO_BE_NR - ioprio) * 10.
+
+Beware that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+slice_idle
+----------
+
+This parameter specifies how long BFQ should idle for next I/O
+request, when certain sync BFQ queues become empty. By default
+slice_idle is a non-zero value. Idling has a double purpose: boosting
+throughput and making sure that the desired throughput distribution is
+respected (see the description of how BFQ works, and, if needed, the
+papers referred to there).
+
+As for throughput, idling can be very helpful on highly seeky media
+like single spindle SATA/SAS disks where we can cut down on overall
+number of seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all the idling on queues and one
+should see an overall improved throughput on faster storage devices
+like multiple SATA/SAS disks in hardware RAID configuration.
+
+So depending on storage and workload, it might be useful to set
+slice_idle=0. In general for SATA/SAS disks and software RAID of
+SATA/SAS disks keeping slice_idle enabled should be useful. For any
+configurations where there are multiple spindles behind single LUN
+(Host based hardware RAID controller or for storage arrays), setting
+slice_idle=0 might end up in better throughput and acceptable
+latencies.
+
+Idling is however necessary to have service guarantees enforced in
+case of differentiated weights or differentiated I/O-request lengths.
+To see why, suppose that a given BFQ queue A must get several I/O
+requests served for each request served for another queue B. Idling
+ensures that, if A makes a new I/O request slightly after becoming
+empty, then no request of B is dispatched in the middle, and thus A
+does not lose the possibility to get more than one request dispatched
+before the next request of B is dispatched. Note that idling
+guarantees the desired differentiated treatment of queues only in
+terms of I/O-request dispatches. To guarantee that the actual service
+order then corresponds to the dispatch order, the strict_guarantees
+tunable must be set too.
+
+There is an important flipside for idling: apart from the above cases
+where it is beneficial also for throughput, idling can severely impact
+throughput. One important case is random workload. Because of this
+issue, BFQ tends to avoid idling as much as possible, when it is not
+beneficial also for throughput. As a consequence of this behavior, and
+of further issues described for the strict_guarantees tunable,
+short-term service guarantees may be occasionally violated. And, in
+some cases, these guarantees may be more important than guaranteeing
+maximum throughput. For example, in video playing/streaming, a very
+low drop rate may be more important than maximum throughput. In these
+cases, consider setting the strict_guarantees parameter.
+
+strict_guarantees
+-----------------
+
+If this parameter is set (default: unset), then BFQ
+
+- always performs idling when the in-service queue becomes empty;
+
+- forces the device to serve one I/O request at a time, by dispatching a
+ new request only if there is no outstanding request.
+
+In the presence of differentiated weights or I/O-request sizes, both
+the above conditions are needed to guarantee that every BFQ queue
+receives its allotted share of the bandwidth. The first condition is
+needed for the reasons explained in the description of the slice_idle
+tunable. The second condition is needed because all modern storage
+devices reorder internally-queued requests, which may trivially break
+the service guarantees enforced by the I/O scheduler.
+
+Setting strict_guarantees may evidently affect throughput.
+
+back_seek_max
+-------------
+
+This specifies, given in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to the
+sectors that are backward in terms of distance.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of request is just 1/back_seek_penalty from a "front"
+request, then the seeking cost of two requests is considered equivalent.
+
+So the scheduler will not bias toward one or the other request (otherwise the
+scheduler will bias toward the front request). Default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+
+This parameter is used to set the timeout of asynchronous requests. Default
+value of this is 248ms.
+
+fifo_expire_sync
+----------------
+
+This parameter is used to set the timeout of synchronous requests. Default
+value of this is 124ms. To favor synchronous requests over asynchronous
+ones, this value should be decreased relative to fifo_expire_async.
+
+low_latency
+-----------
+
+This parameter is used to enable/disable BFQ's low latency mode. By
+default, low latency mode is enabled. If enabled, interactive and soft
+real-time applications are privileged and experience a lower latency,
+as explained in more detail in the description of how BFQ works.
+
+DO NOT enable this mode if you need full control on bandwidth
+distribution. In fact, if it is enabled, then BFQ automatically
+increases the bandwidth share of privileged applications, as the main
+means to guarantee a lower latency to them.
+
+timeout_sync
+------------
+
+Maximum amount of device time that can be given to a task (queue) once
+it has been selected for service. On devices with costly seeks,
+increasing this time usually increases maximum throughput. On the
+opposite end, increasing this time coarsens the granularity of the
+short-term bandwidth and latency guarantees, especially if the
+following parameter is set to zero.
+
+max_budget
+----------
+
+Maximum amount of service, measured in sectors, that can be provided
+to a BFQ queue once it is set in service (of course within the limits
+of the above timeout). According to what was said in the description of
+the algorithm, larger values increase the throughput in proportion to
+the percentage of sequential I/O requests issued. The price of larger
+values is that they coarsen the granularity of short-term bandwidth
+and latency guarantees.
+
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
+to the maximum number of sectors that can be served during
+timeout_sync, according to the estimated peak rate.
+
+weights
+-------
+
+Read-only parameter, used to show the weights of the currently active
+BFQ queues.
+
+
+wr_ tunables
+------------
+
+BFQ exports a few parameters to control/tune the behavior of
+low-latency heuristics.
+
+wr_coeff
+
+Factor by which the weight of a weight-raised queue is multiplied. If
+the queue is deemed soft real-time, then the weight is further
+multiplied by an additional, constant factor.
+
+wr_max_time
+
+Maximum duration of a weight-raising period for an interactive task
+(ms). If set to zero (default value), then this value is computed
+automatically, as a function of the peak rate of the device. In any
+case, when the value of this parameter is read, it always reports the
+current duration, regardless of whether it has been set manually or
+computed automatically.
+
+wr_max_softrt_rate
+
+Maximum service rate below which a queue is deemed to be associated
+with a soft real-time application, and is then weight-raised
+accordingly (sectors/sec).
+
+wr_min_idle_time
+
+Minimum idle period after which interactive weight-raising may be
+reactivated for a queue (in ms).
+
+wr_rt_max_time
+
+Maximum weight-raising duration for soft real-time queues (in ms). The
+start time from which this duration is considered is automatically
+moved forward if the queue is detected to be still soft real-time
+before the current soft real-time weight-raising period finishes.
+
+wr_min_inter_arr_async
+
+Minimum period between I/O request arrivals after which weight-raising
+may be reactivated for an already busy async queue (in ms).
+
+
+4. Group scheduling with BFQ
+============================
+
+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely
+blkio and io. In particular, BFQ supports weight-based proportional
+share. To activate cgroups support, set BFQ_GROUP_IOSCHED.
+
+4-1 Service guarantees provided
+-------------------------------
+
+With BFQ, proportional share means true proportional share of the
+device bandwidth, according to group weights. For example, a group
+with weight 200 gets twice the bandwidth, and not just twice the time,
+of a group with weight 100.
+
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
+distributed among groups and processes in the expected way: for each
+group, the children of the group share the whole bandwidth of the
+group in proportion to their weights. In particular, this implies
+that, for each leaf group, every process of the group receives the
+same share of the whole group bandwidth, unless the ioprio of the
+process is modified.
+
+The resource-sharing guarantee for a group may partially or totally
+switch from bandwidth to time, if providing bandwidth guarantees to
+the group lowers the throughput too much. This switch occurs on a
+per-process basis: if a process of a leaf group causes throughput loss
+if served in such a way to receive its share of the bandwidth, then
+BFQ switches back to just time-based proportional share for that
+process.
+
+4-2 Interface
+-------------
+
+To get proportional sharing of bandwidth with BFQ for a given device,
+BFQ must of course be the active scheduler for that device.
+
+Within each group directory, the names of the files associated with
+BFQ-specific cgroup parameters and stats begin with the "bfq."
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
+or io.bfq.weight.
+
+Parameters to set
+-----------------
+
+For each group, there is only the following parameter to set.
+
+weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the
+group inside its parent. Available values: 1..10000 (default 100). The
+linear mapping between ioprio and weights, described at the beginning
+of the tunable section, is still valid, but all weights higher than
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
+
+Recall that, if low-latency is set, then BFQ automatically raises the
+weight of the queues associated with interactive and soft real-time
+applications. Unset this tunable if you need/want to control weights.
+
+
+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ Scheduler", Proceedings of the First Workshop on Mobile System
+ Technologies (MST-2015), May 2015.
+ http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+
+[2] P. Valente and M. Andreolini, "Improving Application
+ Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
+ the 5th Annual International Systems and Storage Conference
+ (SYSTOR '12), June 2012.
+ Slightly extended version:
+ http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-
+ results.pdf
diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.txt
new file mode 100644
index 0000000000000..e94feacd7edcd
--- /dev/null
+++ b/Documentation/block/kyber-iosched.txt
@@ -0,0 +1,14 @@
+Kyber I/O scheduler tunables
+===========================
+
+The only two tunables for the Kyber scheduler are the target latencies for
+reads and synchronous writes. Kyber will throttle requests in order to meet
+these target latencies.
+
+read_lat_nsec
+-------------
+Target latency for reads (in nanoseconds).
+
+write_lat_nsec
+--------------
+Target latency for synchronous writes (in nanoseconds).
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index c0a3bb5a6e4eb..2c1e67058fd3b 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue
smaller discards and potentially help reduce latencies induced by large
discard operations.
-discard_zeroes_data (RO)
-------------------------
-When read, this file will show if the discarded block are zeroed by the
-device or not. If its value is '1' the blocks are zeroed otherwise not.
-
hw_sector_size (RO)
-------------------
This is the hardware sector size of the device, in bytes.
@@ -192,5 +187,11 @@ scaling back writes. Writing a value of '0' to this file disables the
feature. Writing a value of '-1' to this file resets the value to the
default setting.
+throttle_sample_time (RW)
+-------------------------
+This is the time window over which blk-throttle samples data, in milliseconds.
+blk-throttle makes decisions based on the samples. A lower time means cgroups
+have smoother throughput, but higher CPU overhead. This exists only when
+CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt
deleted file mode 100644
index f7e0505514871..0000000000000
--- a/Documentation/blockdev/mflash.txt
+++ /dev/null
@@ -1,84 +0,0 @@
-This document describes m[g]flash support in linux.
-
-Contents
- 1. Overview
- 2. Reserved area configuration
- 3. Example of mflash platform driver registration
-
-1. Overview
-
-Mflash and gflash are embedded flash drive. The only difference is mflash is
-MCP(Multi Chip Package) device. These two device operate exactly same way.
-So the rest mflash repersents mflash and gflash altogether.
-
-Internally, mflash has nand flash and other hardware logics and supports
-2 different operation (ATA, IO) modes. ATA mode doesn't need any new
-driver and currently works well under standard IDE subsystem. Actually it's
-one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have
-IDE interface.
-
-Following are brief descriptions about IO mode.
-A. IO mode based on ATA protocol and uses some custom command. (read confirm,
-write confirm)
-B. IO mode uses SRAM bus interface.
-C. IO mode supports 4kB boot area, so host can boot from mflash.
-
-2. Reserved area configuration
-If host boot from mflash, usually needs raw area for boot loader image. All of
-the mflash's block device operation will be taken this value as start offset.
-Note that boot loader's size of reserved area and kernel configuration value
-must be same.
-
-3. Example of mflash platform driver registration
-Working mflash is very straight forward. Adding platform device stuff to board
-configuration file is all. Here is some pseudo example.
-
-static struct mg_drv_data mflash_drv_data = {
- /* If you want to polling driver set to 1 */
- .use_polling = 0,
- /* device attribution */
- .dev_attr = MG_BOOT_DEV
-};
-
-static struct resource mg_mflash_rsc[] = {
- /* Base address of mflash */
- [0] = {
- .start = 0x08000000,
- .end = 0x08000000 + SZ_64K - 1,
- .flags = IORESOURCE_MEM
- },
- /* mflash interrupt pin */
- [1] = {
- .start = IRQ_GPIO(84),
- .end = IRQ_GPIO(84),
- .flags = IORESOURCE_IRQ
- },
- /* mflash reset pin */
- [2] = {
- .start = 43,
- .end = 43,
- .name = MG_RST_PIN,
- .flags = IORESOURCE_IO
- },
- /* mflash reset-out pin
- * If you use mflash as storage device (i.e. other than MG_BOOT_DEV),
- * should assign this */
- [3] = {
- .start = 51,
- .end = 51,
- .name = MG_RSTOUT_PIN,
- .flags = IORESOURCE_IO
- }
-};
-
-static struct platform_device mflash_dev = {
- .name = MG_DEV_NAME,
- .id = -1,
- .dev = {
- .platform_data = &mflash_drv_data,
- },
- .num_resources = ARRAY_SIZE(mg_mflash_rsc),
- .resource = mg_mflash_rsc
-};
-
-platform_device_register(&mflash_dev);
diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/lightnvm/pblk.txt
new file mode 100644
index 0000000000000..1040ed1cec812
--- /dev/null
+++ b/Documentation/lightnvm/pblk.txt
@@ -0,0 +1,21 @@
+pblk: Physical Block Device Target
+==================================
+
+pblk implements a fully associative, host-based FTL that exposes a traditional
+block I/O interface. Its primary responsibilities are:
+
+ - Map logical addresses onto physical addresses (4KB granularity) in a
+ logical-to-physical (L2P) table.
+ - Maintain the integrity and consistency of the L2P table as well as its
+ recovery from normal tear down and power outage.
+ - Deal with controller- and media-specific constraints.
+ - Handle I/O errors.
+ - Implement garbage collection.
+ - Maintain consistency across the I/O stack during synchronization points.
+
+For more information please refer to:
+
+ http://lightnvm.io
+
+which maintains updated FAQs, manual pages, technical documentation, tools,
+contacts, etc.
diff --git a/MAINTAINERS b/MAINTAINERS
index 38d3e4ed7208b..1bb06c5f77166 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2544,6 +2544,14 @@ F: block/
F: kernel/trace/blktrace.c
F: lib/sbitmap.c
+BFQ I/O SCHEDULER
+M: Paolo Valente <paolo.valente@linaro.org>
+M: Jens Axboe <axboe@kernel.dk>
+L: linux-block@vger.kernel.org
+S: Maintained
+F: block/bfq-*
+F: Documentation/block/bfq-iosched.txt
+
BLOCK2MTD DRIVER
M: Joern Engel <joern@lazybastard.org>
L: linux-mtd@lists.infradead.org
diff --git a/block/Kconfig b/block/Kconfig
index e9f780f815f5d..89cd28f8d0512 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -115,6 +115,18 @@ config BLK_DEV_THROTTLING
See Documentation/cgroups/blkio-controller.txt for more information.
+config BLK_DEV_THROTTLING_LOW
+ bool "Block throttling .low limit interface support (EXPERIMENTAL)"
+ depends on BLK_DEV_THROTTLING
+ default n
+ ---help---
+ Add .low limit interface for block throttling. The low limit is a best
+ effort limit to prioritize cgroups. Depending on the setting, the limit
+ can be used to protect cgroups in terms of bandwidth/iops and better
+ utilize disk resource.
+
+ Note, this is an experimental interface and could be changed someday.
+
config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
default n
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 58fc8684788d1..fd2cefa47d354 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -40,6 +40,7 @@ config CFQ_GROUP_IOSCHED
Enable group IO scheduling in CFQ.
choice
+
prompt "Default I/O scheduler"
default DEFAULT_CFQ
help
@@ -69,6 +70,35 @@ config MQ_IOSCHED_DEADLINE
---help---
MQ version of the deadline IO scheduler.
+config MQ_IOSCHED_KYBER
+ tristate "Kyber I/O scheduler"
+ default y
+ ---help---
+ The Kyber I/O scheduler is a low-overhead scheduler suitable for
+ multiqueue and other fast devices. Given target latencies for reads and
+ synchronous writes, it will self-tune queue depths to achieve that
+ goal.
+
+config IOSCHED_BFQ
+ tristate "BFQ I/O scheduler"
+ default n
+ ---help---
+ BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
+ the device among all processes according to their weights,
+ regardless of the device parameters and with any workload. It
+ also guarantees a low latency to interactive and soft
+ real-time applications. Details in
+ Documentation/block/bfq-iosched.txt
+
+config BFQ_GROUP_IOSCHED
+ bool "BFQ hierarchical scheduling support"
+ depends on IOSCHED_BFQ && BLK_CGROUP
+ default n
+ ---help---
+
+ Enable hierarchical scheduling in BFQ, using the blkio
+ (cgroups-v1) or io (cgroups-v2) controller.
+
endmenu
endif
diff --git a/block/Makefile b/block/Makefile
index 081bb680789bc..2b281cf258a0a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -20,6 +20,9 @@ obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
+obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
+bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
+obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644
index 0000000000000..c8a32fb345cf5
--- /dev/null
+++ b/block/bfq-cgroup.c
@@ -0,0 +1,1139 @@
+/*
+ * cgroups support for the BFQ I/O scheduler.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
+#include "bfq-iosched.h"
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * bfqg stats flags.
+ *
+ * Each enumerator is a bit position inside bfqg_stats->flags: the helpers
+ * generated by BFQG_FLAG_FNS() shift 1 left by these values.
+ */
+enum bfqg_stats_flags {
+ BFQG_stats_waiting = 0,
+ BFQG_stats_idling,
+ BFQG_stats_empty,
+};
+
+/*
+ * BFQG_FLAG_FNS(name) generates three static helpers that operate on the
+ * BFQG_stats_<name> bit of stats->flags:
+ *
+ *   bfqg_stats_mark_<name>()  - set the bit
+ *   bfqg_stats_clear_<name>() - clear the bit
+ *   bfqg_stats_<name>()       - return 1 if the bit is set, 0 otherwise
+ *
+ * It is instantiated below for the waiting, idling and empty flags and
+ * then undefined.
+ */
+#define BFQG_FLAG_FNS(name) \
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \
+{ \
+ stats->flags |= (1 << BFQG_stats_##name); \
+} \
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \
+{ \
+ stats->flags &= ~(1 << BFQG_stats_##name); \
+} \
+static int bfqg_stats_##name(struct bfqg_stats *stats) \
+{ \
+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \
+} \
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
+
+/*
+ * Close the group-wait interval tracked in @stats, if one is open: the
+ * time elapsed since start_group_wait_time is added to group_wait_time
+ * and the waiting flag is cleared. Does nothing if the flag is not set.
+ *
+ * This should be called with the queue_lock held.
+ */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+	unsigned long long curr;
+
+	if (bfqg_stats_waiting(stats)) {
+		curr = sched_clock();
+		/* account the interval only if the clock moved forward */
+		if (time_after64(curr, stats->start_group_wait_time))
+			blkg_stat_add(&stats->group_wait_time,
+				      curr - stats->start_group_wait_time);
+		bfqg_stats_clear_waiting(stats);
+	}
+}
+
+/*
+ * Open a group-wait interval for @bfqg: record the current time and set
+ * the waiting flag. Nothing is done if the flag is already set or if
+ * @bfqg is @curr_bfqg itself.
+ *
+ * This should be called with the queue_lock held.
+ */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+						 struct bfq_group *curr_bfqg)
+{
+	struct bfqg_stats *st = &bfqg->stats;
+
+	if (bfqg_stats_waiting(st) || bfqg == curr_bfqg)
+		return;
+
+	st->start_group_wait_time = sched_clock();
+	bfqg_stats_mark_waiting(st);
+}
+
+/*
+ * Close the empty interval tracked in @stats, if one is open: the time
+ * elapsed since start_empty_time is added to empty_time and the empty
+ * flag is cleared. Does nothing if the flag is not set.
+ *
+ * This should be called with the queue_lock held.
+ */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+	unsigned long long curr;
+
+	if (bfqg_stats_empty(stats)) {
+		curr = sched_clock();
+		/* account the interval only if the clock moved forward */
+		if (time_after64(curr, stats->start_empty_time))
+			blkg_stat_add(&stats->empty_time,
+				      curr - stats->start_empty_time);
+		bfqg_stats_clear_empty(stats);
+	}
+}
+
+/* Add one to the dequeue counter of @bfqg. */
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *st = &bfqg->stats;
+
+	blkg_stat_add(&st->dequeue, 1);
+}
+
+/*
+ * Open an empty interval for @bfqg: record the current time and set the
+ * empty flag, provided that the queued counter of the group is zero and
+ * the group is not already marked empty.
+ */
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *st = &bfqg->stats;
+
+	/*
+	 * Nothing to do while I/O is still queued. The group may also be
+	 * already marked empty: this can happen if bfqq got new request in
+	 * parent group and moved to this group while being added to service
+	 * tree. Just ignore the event and move on in that case as well.
+	 */
+	if (blkg_rwstat_total(&st->queued) || bfqg_stats_empty(st))
+		return;
+
+	st->start_empty_time = sched_clock();
+	bfqg_stats_mark_empty(st);
+}
+
+/*
+ * Close the idle interval of @bfqg, if one is open: the time elapsed
+ * since start_idle_time is added to idle_time and the idling flag is
+ * cleared. Does nothing if the flag is not set.
+ */
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *st = &bfqg->stats;
+	unsigned long long curr;
+
+	if (!bfqg_stats_idling(st))
+		return;
+
+	curr = sched_clock();
+	/* account the interval only if the clock moved forward */
+	if (time_after64(curr, st->start_idle_time))
+		blkg_stat_add(&st->idle_time, curr - st->start_idle_time);
+	bfqg_stats_clear_idling(st);
+}
+
+/*
+ * Open an idle interval for @bfqg: record the current time and set the
+ * idling flag. Unlike the wait/empty counterparts, no check is made on
+ * whether the flag is already set.
+ */
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+	bfqg->stats.start_idle_time = sched_clock();
+	bfqg_stats_mark_idling(&bfqg->stats);
+}
+
+/*
+ * Take one sample for the average queue size of @bfqg: the current total
+ * of the queued counter is added to avg_queue_size_sum and the number of
+ * samples is incremented. The group-wait interval, if open, is closed too.
+ */
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+	blkg_stat_add(&bfqg->stats.avg_queue_size_sum,
+		      blkg_rwstat_total(&bfqg->stats.queued));
+	blkg_stat_add(&bfqg->stats.avg_queue_size_samples, 1);
+	bfqg_stats_update_group_wait_time(&bfqg->stats);
+}
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+/* Return the bfq_group that contains @pd as its pd member, NULL for NULL. */
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+	if (!pd)
+		return NULL;
+
+	return container_of(pd, struct bfq_group, pd);
+}
+
+/* Map @bfqg to its blkcg_gq through the group's policy data. */
+struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+	struct blkg_policy_data *pd = &bfqg->pd;
+
+	return pd_to_blkg(pd);
+}
+
+/*
+ * Map @blkg to the bfq_group holding its BFQ policy data; NULL if
+ * blkg_to_pd() yields no policy data for blkcg_policy_bfq.
+ */
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
+
+	return pd_to_bfqg(pd);
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+/*
+ * Return the bfq_group of the parent blkg of @bfqg, or NULL if the blkg
+ * of @bfqg has no parent.
+ */
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+	struct blkcg_gq *parent_blkg = bfqg_to_blkg(bfqg)->parent;
+
+	if (!parent_blkg)
+		return NULL;
+
+	return blkg_to_bfqg(parent_blkg);
+}
+
+/*
+ * Return the group @bfqq is attached to: the bfq_group containing the
+ * parent entity of the queue's entity or, if that entity has no parent,
+ * the root group of the queue's bfqd.
+ */
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *parent_entity = bfqq->entity.parent;
+
+	if (!parent_entity)
+		return bfqq->bfqd->root_group;
+
+	return container_of(parent_entity, struct bfq_group, entity);
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+/*
+ * Get a reference on @bfqg by calling blkg_get() on its blkg.
+ *
+ * The call is a plain statement: a return statement with an expression
+ * is not allowed in a function returning void.
+ */
+static void bfqg_get(struct bfq_group *bfqg)
+{
+	blkg_get(bfqg_to_blkg(bfqg));
+}
+
+/*
+ * Put a reference on @bfqg by calling blkg_put() on its blkg.
+ *
+ * The call is a plain statement: a return statement with an expression
+ * is not allowed in a function returning void.
+ */
+void bfqg_put(struct bfq_group *bfqg)
+{
+	blkg_put(bfqg_to_blkg(bfqg));
+}
+
+/*
+ * Account a request of type @op added to @bfqg on behalf of @bfqq: the
+ * queued counter is incremented and the empty interval, if open, is
+ * closed. Unless @bfqq is the queue in service, a group-wait interval is
+ * also started for @bfqg (relative to the group of @bfqq).
+ */
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+			      unsigned int op)
+{
+	struct bfq_data *bfqd = (struct bfq_data *)bfqg->bfqd;
+
+	blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+	bfqg_stats_end_empty_time(&bfqg->stats);
+
+	if (bfqq != bfqd->in_service_queue)
+		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+/* Account the removal of a request of type @op: queued counter minus one. */
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+{
+	struct bfqg_stats *st = &bfqg->stats;
+
+	blkg_rwstat_add(&st->queued, op, -1);
+}
+
+/* Account a merge of a request of type @op: merged counter plus one. */
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+{
+	struct bfqg_stats *st = &bfqg->stats;
+
+	blkg_rwstat_add(&st->merged, op, 1);
+}
+
+/*
+ * Account a completion of type @op in the stats of @bfqg:
+ *  - service_time grows by (now - @io_start_time), if now is after
+ *    @io_start_time;
+ *  - wait_time grows by (@io_start_time - @start_time), if
+ *    @io_start_time is after @start_time.
+ */
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+				  uint64_t io_start_time, unsigned int op)
+{
+	struct bfqg_stats *st = &bfqg->stats;
+	unsigned long long end;
+
+	end = sched_clock();
+
+	if (time_after64(end, io_start_time))
+		blkg_rwstat_add(&st->service_time, op, end - io_start_time);
+
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&st->wait_time, op,
+				io_start_time - start_time);
+}
+
+/*
+ * @stats = 0
+ *
+ * Reset the counters of @stats listed below. The queued counter, the
+ * flags and the start_* timestamps are not touched.
+ */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+ /* queued stats shouldn't be cleared */
+ blkg_rwstat_reset(&stats->merged);
+ blkg_rwstat_reset(&stats->service_time);
+ blkg_rwstat_reset(&stats->wait_time);
+ blkg_stat_reset(&stats->time);
+ blkg_stat_reset(&stats->avg_queue_size_sum);
+ blkg_stat_reset(&stats->avg_queue_size_samples);
+ blkg_stat_reset(&stats->dequeue);
+ blkg_stat_reset(&stats->group_wait_time);
+ blkg_stat_reset(&stats->idle_time);
+ blkg_stat_reset(&stats->empty_time);
+}
+
+/*
+ * @to += @from
+ *
+ * Add every counter of @from, except queued, to the aux count of the
+ * matching counter of @to. Nothing is done if either pointer is NULL.
+ */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+	if (!to || !from)
+		return;
+
+	/* queued stats are deliberately not transferred */
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	/*
+	 * The destination must be @to: passing @from twice would add the
+	 * time of @from to itself and never propagate it to @to.
+	 */
+	blkg_stat_add_aux(&to->time, &from->time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples,
+			  &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Hand the stats of @bfqg over to its parent: they are added to the
+ * parent's aux counts and then reset in @bfqg, so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg
+ * after it's gone. Nothing is transferred if @bfqg is NULL or has no
+ * parent.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+	struct bfq_group *ancestor;
+
+	if (!bfqg) /* root_group */
+		return;
+
+	ancestor = bfqg_parent(bfqg);
+
+	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+	if (likely(ancestor)) {
+		bfqg_stats_add_aux(&ancestor->stats, &bfqg->stats);
+		bfqg_stats_reset(&bfqg->stats);
+	}
+}
+
+/*
+ * Initialise @entity as a child of @bfqg: its weight and orig_weight are
+ * set from new_weight, and its parent and sched_data are pointed at the
+ * group. If @entity belongs to a bfq_queue, the queue's ioprio and
+ * ioprio_class are set from their new_* values and a reference on @bfqg
+ * is taken.
+ */
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+	struct bfq_queue *queue = bfq_entity_to_bfqq(entity);
+
+	entity->orig_weight = entity->new_weight;
+	entity->weight = entity->new_weight;
+
+	if (queue) {
+		queue->ioprio_class = queue->new_ioprio_class;
+		queue->ioprio = queue->new_ioprio;
+		bfqg_get(bfqg);
+	}
+
+	/* my_entity is NULL for root group */
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+}
+
+/*
+ * Call the blkg_rwstat_exit()/blkg_stat_exit() helper on every counter of
+ * @stats, i.e., on the same set of counters that bfqg_stats_init() sets
+ * up.
+ */
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+ blkg_rwstat_exit(&stats->merged);
+ blkg_rwstat_exit(&stats->service_time);
+ blkg_rwstat_exit(&stats->wait_time);
+ blkg_rwstat_exit(&stats->queued);
+ blkg_stat_exit(&stats->time);
+ blkg_stat_exit(&stats->avg_queue_size_sum);
+ blkg_stat_exit(&stats->avg_queue_size_samples);
+ blkg_stat_exit(&stats->dequeue);
+ blkg_stat_exit(&stats->group_wait_time);
+ blkg_stat_exit(&stats->idle_time);
+ blkg_stat_exit(&stats->empty_time);
+}
+
+/*
+ * Initialise every counter of @stats, allocating with @gfp.
+ *
+ * Initialisation stops at the first helper that reports a failure; in
+ * that case bfqg_stats_exit() is called on @stats and -ENOMEM is
+ * returned. Returns 0 on success.
+ */
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+	if (blkg_rwstat_init(&stats->merged, gfp))
+		goto fail;
+	if (blkg_rwstat_init(&stats->service_time, gfp))
+		goto fail;
+	if (blkg_rwstat_init(&stats->wait_time, gfp))
+		goto fail;
+	if (blkg_rwstat_init(&stats->queued, gfp))
+		goto fail;
+	if (blkg_stat_init(&stats->time, gfp))
+		goto fail;
+	if (blkg_stat_init(&stats->avg_queue_size_sum, gfp))
+		goto fail;
+	if (blkg_stat_init(&stats->avg_queue_size_samples, gfp))
+		goto fail;
+	if (blkg_stat_init(&stats->dequeue, gfp))
+		goto fail;
+	if (blkg_stat_init(&stats->group_wait_time, gfp))
+		goto fail;
+	if (blkg_stat_init(&stats->idle_time, gfp))
+		goto fail;
+	if (blkg_stat_init(&stats->empty_time, gfp))
+		goto fail;
+
+	return 0;
+
+fail:
+	/*
+	 * NOTE(review): the exit helpers run on all counters here, including
+	 * those not initialised yet - presumably they tolerate that; verify.
+	 */
+	bfqg_stats_exit(stats);
+	return -ENOMEM;
+}
+
+/* Return the bfq_group_data containing @cpd as its pd member, NULL for NULL. */
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+	if (!cpd)
+		return NULL;
+
+	return container_of(cpd, struct bfq_group_data, pd);
+}
+
+/*
+ * Map @blkcg to its bfq_group_data; NULL if blkcg_to_cpd() yields no
+ * policy data for blkcg_policy_bfq.
+ */
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+	struct blkcg_policy_data *cpd = blkcg_to_cpd(blkcg, &blkcg_policy_bfq);
+
+	return cpd_to_bfqgd(cpd);
+}
+
+/*
+ * Allocate a zeroed bfq_group_data with @gfp and return a pointer to its
+ * pd member, or NULL if the allocation fails.
+ */
+struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+	struct bfq_group_data *gd = kzalloc(sizeof(*gd), gfp);
+
+	return gd ? &gd->pd : NULL;
+}
+
+/*
+ * Set the weight of the bfq_group_data behind @cpd: CGROUP_WEIGHT_DFL if
+ * cgroup_subsys_on_dfl(io_cgrp_subsys) holds, BFQ_WEIGHT_LEGACY_DFL
+ * otherwise.
+ */
+void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+	struct bfq_group_data *gd = cpd_to_bfqgd(cpd);
+
+	if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+		gd->weight = CGROUP_WEIGHT_DFL;
+	else
+		gd->weight = BFQ_WEIGHT_LEGACY_DFL;
+}
+
+/* Free the bfq_group_data that contains @cpd. */
+void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+	struct bfq_group_data *gd = cpd_to_bfqgd(cpd);
+
+	kfree(gd);
+}
+
+/*
+ * Allocate a zeroed bfq_group on @node with @gfp and initialise its stats.
+ * Returns a pointer to the pd member of the new group, or NULL if either
+ * the allocation or the stats initialisation fails (the group is freed in
+ * the latter case).
+ */
+struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+	struct bfq_group *grp = kzalloc_node(sizeof(*grp), gfp, node);
+
+	if (grp && bfqg_stats_init(&grp->stats, gfp)) {
+		kfree(grp);
+		grp = NULL;
+	}
+
+	return grp ? &grp->pd : NULL;
+}
+
+/*
+ * Initialise the bfq_group behind @pd: the weights of its entity are
+ * taken from the bfq_group_data of the blkcg owning the blkg, the entity
+ * is linked to the group's own sched_data, and the group is bound to the
+ * elevator data of the blkg's queue.
+ */
+void bfq_pd_init(struct blkg_policy_data *pd)
+{
+	struct blkcg_gq *blkg = pd_to_blkg(pd);
+	struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+	struct bfq_group_data *gd = blkcg_to_bfqgd(blkg->blkcg);
+	struct bfq_entity *ent = &bfqg->entity;
+
+	/* all three weights start from the per-blkcg weight */
+	ent->new_weight = gd->weight;
+	ent->weight = ent->new_weight;
+	ent->orig_weight = ent->weight;
+	ent->my_sched_data = &bfqg->sched_data;
+
+	/*
+	 * the root_group's will be set to NULL
+	 * in bfq_init_queue()
+	 */
+	bfqg->my_entity = ent;
+	bfqg->bfqd = blkg->q->elevator->elevator_data;
+	bfqg->active_entities = 0;
+	bfqg->rq_pos_tree = RB_ROOT;
+}
+
+/*
+ * Tear down the stats of the bfq_group behind @pd and free the group.
+ *
+ * kfree() is called as a plain statement: a return statement with an
+ * expression is not allowed in a function returning void.
+ */
+void bfq_pd_free(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_exit(&bfqg->stats);
+	kfree(bfqg);
+}
+
+/* Reset the stats of the bfq_group behind @pd (see bfqg_stats_reset()). */
+void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+	bfqg_stats_reset(&pd_to_bfqg(pd)->stats);
+}
+
+/*
+ * Hook the entity of @bfqg below @parent: its parent entity becomes the
+ * my_entity of @parent and it is scheduled through the sched_data of
+ * @parent.
+ */
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+				 struct bfq_group *parent)
+{
+	bfqg->entity.parent = parent->my_entity;
+	bfqg->entity.sched_data = &parent->sched_data;
+}
+
+/*
+ * Look up the blkg of @blkcg on the queue of @bfqd and return the
+ * matching bfq_group, or NULL if blkg_lookup() finds nothing.
+ */
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+					 struct blkcg *blkcg)
+{
+	struct blkcg_gq *found = blkg_lookup(blkcg, bfqd->queue);
+
+	if (unlikely(!found))
+		return NULL;
+
+	return blkg_to_bfqg(found);
+}
+
+/*
+ * Look up the bfq_group of @blkcg on the queue of @bfqd and make sure
+ * that it, and every group reached by walking up from its entity with
+ * for_each_entity(), is hooked to its parent group (or to the root group
+ * when bfqg_parent() returns NULL).
+ *
+ * Returns the group found for @blkcg, or NULL if the lookup fails.
+ */
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+				     struct blkcg *blkcg)
+{
+	struct bfq_group *bfqg, *parent;
+	struct bfq_entity *entity;
+
+	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
+
+	if (unlikely(!bfqg))
+		return NULL;
+
+	/*
+	 * Update chain of bfq_groups as we might be handling a leaf group
+	 * which, along with some of its relatives, has not been hooked yet
+	 * to the private hierarchy of BFQ.
+	 */
+	entity = &bfqg->entity;
+	for_each_entity(entity) {
+		/*
+		 * Use a separate cursor for the walk: reusing bfqg here
+		 * would make the function return the last group visited
+		 * instead of the group looked up for @blkcg.
+		 */
+		struct bfq_group *curr_bfqg = container_of(entity,
+							   struct bfq_group,
+							   entity);
+
+		if (curr_bfqg != bfqd->root_group) {
+			parent = bfqg_parent(curr_bfqg);
+			if (!parent)
+				parent = bfqd->root_group;
+			bfq_group_set_parent(curr_bfqg, parent);
+		}
+	}
+
+	return bfqg;
+}
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one. Avoid putting the entity on the old group idle tree.
+ *
+ * The reference held on the old group is dropped and one is taken on
+ * @bfqg. A dispatch is scheduled at the end if no queue is in service and
+ * no request is in the driver.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ /* If bfqq is empty, then bfq_bfqq_expire also invokes
+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+ * from data structures related to current group. Otherwise we
+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+ * we do below.
+ */
+ if (bfqq == bfqd->in_service_queue)
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+ false, BFQQE_PREEMPTED);
+
+ /*
+ * Detach the entity from the old group: deactivate it if the queue
+ * is busy, otherwise take it off the idle tree if it is still on a
+ * service tree.
+ */
+ if (bfq_bfqq_busy(bfqq))
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+ else if (entity->on_st)
+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+ /* drop the reference on the group bfqq is leaving */
+ bfqg_put(bfqq_group(bfqq));
+
+ /*
+ * Here we use a reference to bfqg. We don't need a refcounter
+ * as the cgroup reference will not be dropped, so that its
+ * destroy() callback will not be invoked.
+ *
+ * NOTE(review): the sentence above predates the bfqg_get() call
+ * below, which does take a reference on bfqg - confirm and reword.
+ */
+ entity->parent = bfqg->my_entity;
+ entity->sched_data = &bfqg->sched_data;
+ bfqg_get(bfqg);
+
+ /* a queue that is still busy is re-inserted in the new group */
+ if (bfq_bfqq_busy(bfqq)) {
+ bfq_pos_tree_add_move(bfqd, bfqq);
+ bfq_activate_bfqq(bfqd, bfqq);
+ }
+
+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+ bfq_schedule_dispatch(bfqd);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @blkcg.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * The async queue of @bic, if it is scheduled in a different group, is
+ * detached from @bic and released; the sync queue, if it is scheduled in
+ * a different group, is moved to the new group with bfq_bfqq_move().
+ *
+ * Returns the group found for @blkcg, or the root group of @bfqd if
+ * bfq_find_set_group() fails.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct blkcg *blkcg)
+{
+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+ struct bfq_group *bfqg;
+ struct bfq_entity *entity;
+
+ bfqg = bfq_find_set_group(bfqd, blkcg);
+
+ /* fall back to the root group if no group could be found */
+ if (unlikely(!bfqg))
+ bfqg = bfqd->root_group;
+
+ if (async_bfqq) {
+ entity = &async_bfqq->entity;
+
+ /* async queue in another group: unlink it from bic and put it */
+ if (entity->sched_data != &bfqg->sched_data) {
+ bic_set_bfqq(bic, NULL, 0);
+ bfq_log_bfqq(bfqd, async_bfqq,
+ "bic_change_group: %p %d",
+ async_bfqq, async_bfqq->ref);
+ bfq_put_queue(async_bfqq);
+ }
+ }
+
+ if (sync_bfqq) {
+ entity = &sync_bfqq->entity;
+ /* sync queue in another group: migrate it to bfqg */
+ if (entity->sched_data != &bfqg->sched_data)
+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ }
+
+ return bfqg;
+}
+
+/*
+ * Make @bic follow the blkcg of @bio.
+ *
+ * The serial number of the blkcg of @bio is compared with the one cached
+ * in @bic; if they differ (and @bic has a bfqd), the queues of @bic are
+ * switched to the new blkcg with __bfq_bic_change_cgroup() and the cached
+ * serial number is updated. The whole operation runs inside an RCU
+ * read-side critical section.
+ */
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	uint64_t serial_nr;
+
+	rcu_read_lock();
+	serial_nr = bio_blkcg(bio)->css.serial_nr;
+
+	/*
+	 * Check whether blkcg has changed. The condition may trigger
+	 * spuriously on a newly created bic but there's no harm.
+	 */
+	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
+		goto out;
+
+	/* the group returned by the helper is not needed here */
+	__bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	bic->blkcg_serial_nr = serial_nr;
+out:
+	rcu_read_unlock();
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ *
+ * Keeps deactivating @st->first_idle until it becomes NULL.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+	struct bfq_entity *idle;
+
+	while ((idle = st->first_idle) != NULL)
+		__bfq_deactivate_entity(idle, false);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ *
+ * The queue owning @entity is moved with bfq_bfqq_move().
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+				     struct bfq_entity *entity)
+{
+	bfq_bfqq_move(bfqd, bfq_entity_to_bfqq(entity), bfqd->root_group);
+}
+
+/**
+ * bfq_reparent_active_entities - move to the root group all active
+ *                                entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
+ *
+ * The first entity of the active tree of @st is reparented repeatedly,
+ * until none is left; then the in-service entity of @bfqg, if any, is
+ * reparented as well.
+ *
+ * Needs queue_lock to be taken and reference to be valid over the call.
+ */
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg,
+					 struct bfq_service_tree *st)
+{
+	struct rb_root *active = &st->active;
+	struct bfq_entity *first;
+
+	if (!RB_EMPTY_ROOT(active)) {
+		first = bfq_entity_of(rb_first(active));
+		while (first) {
+			bfq_reparent_leaf_entity(bfqd, first);
+			first = bfq_entity_of(rb_first(active));
+		}
+	}
+
+	if (bfqg->sched_data.in_service_entity)
+		bfq_reparent_leaf_entity(bfqd,
+					 bfqg->sched_data.in_service_entity);
+}
+
+/**
+ * bfq_pd_offline - deactivate the entity associated with @pd,
+ * and reparent its children entities.
+ * @pd: descriptor of the policy going offline.
+ *
+ * Nothing is done for a group without my_entity (the root group).
+ * For any other group, with bfqd->lock held: the idle tree of every
+ * service tree is flushed, active and in-service leaf entities are
+ * moved to the root group, the group entity is deactivated and
+ * bfq_put_async_queues() is called for the group. After the lock is
+ * released, bfqg_stats_xfer_dead() is called for the group.
+ *
+ * blkio already grabs the queue_lock for us, so no need to use
+ * RCU-based magic
+ */
+void bfq_pd_offline(struct blkg_policy_data *pd)
+{
+ struct bfq_service_tree *st;
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+ struct bfq_data *bfqd = bfqg->bfqd;
+ struct bfq_entity *entity = bfqg->my_entity;
+ unsigned long flags;
+ int i;
+
+ if (!entity) /* root group */
+ return;
+
+ spin_lock_irqsave(&bfqd->lock, flags);
+ /*
+ * Empty all service_trees belonging to this group before
+ * deactivating the group itself.
+ */
+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+ st = bfqg->sched_data.service_tree + i;
+
+ /*
+ * The idle tree may still contain bfq_queues belonging
+ * to exited task because they never migrated to a different
+ * cgroup from the one being destroyed now. No one else
+ * can access them so it's safe to act without any lock.
+ * NOTE(review): bfqd->lock is nevertheless held at this
+ * point.
+ */
+ bfq_flush_idle_tree(st);
+
+ /*
+ * It may happen that some queues are still active
+ * (busy) upon group destruction (if the corresponding
+ * processes have been forced to terminate). We move
+ * all the leaf entities corresponding to these queues
+ * to the root_group.
+ * Also, it may happen that the group has an entity
+ * in service, which is disconnected from the active
+ * tree: it must be moved, too.
+ * There is no need to put the sync queues, as the
+ * scheduler has taken no reference.
+ */
+ bfq_reparent_active_entities(bfqd, bfqg, st);
+ }
+
+ __bfq_deactivate_entity(entity, false);
+ bfq_put_async_queues(bfqd, bfqg);
+
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ /*
+ * @blkg is going offline and will be ignored by
+ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so
+ * that they don't get lost. If IOs complete after this point, the
+ * stats for them will be lost. Oh well...
+ */
+ bfqg_stats_xfer_dead(bfqg);
+}
+
+/*
+ * Call bfq_end_wr_async_queues() for the bfq_group of every blkg on the
+ * blkg_list of the request queue of @bfqd, and then once more for the
+ * root group.
+ */
+void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	struct blkcg_gq *pos;
+
+	list_for_each_entry(pos, &bfqd->queue->blkg_list, q_node)
+		bfq_end_wr_async_queues(bfqd, blkg_to_bfqg(pos));
+
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+/*
+ * seq_show handler for the bfq.weight file: prints the weight stored in
+ * the bfq_group_data of the blkcg owning @sf, or 0 if there is no such
+ * data. Always returns 0.
+ */
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
+{
+	struct bfq_group_data *bfqgd =
+		blkcg_to_bfqgd(css_to_blkcg(seq_css(sf)));
+	unsigned int val = bfqgd ? bfqgd->weight : 0;
+
+	seq_printf(sf, "%u\n", val);
+
+	return 0;
+}
+
+/*
+ * Store @val as the weight of the blkcg associated with @css and, with
+ * blkcg->lock held, propagate it as new_weight to the entity of every
+ * bfq_group found on the blkg_list of that blkcg.
+ *
+ * Returns -ERANGE if @val lies outside [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT],
+ * 0 otherwise. @cftype is not used.
+ *
+ * NOTE(review): bfqgd is dereferenced without a NULL check, whereas
+ * bfq_io_show_weight() does check it -- confirm it cannot be NULL here.
+ */
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
+ struct cftype *cftype,
+ u64 val)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+ struct blkcg_gq *blkg;
+ int ret = -ERANGE;
+
+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+ return ret;
+
+ ret = 0;
+ spin_lock_irq(&blkcg->lock);
+ bfqgd->weight = (unsigned short)val;
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+ /* Skip blkgs for which no bfq_group is returned. */
+ if (!bfqg)
+ continue;
+ /*
+ * Setting the prio_changed flag of the entity
+ * to 1 with new_weight == weight would re-set
+ * the value of the weight to its ioprio mapping.
+ * Set the flag only if necessary.
+ */
+ if ((unsigned short)val != bfqg->entity.new_weight) {
+ bfqg->entity.new_weight = (unsigned short)val;
+ /*
+ * Make sure that the above new value has been
+ * stored in bfqg->entity.new_weight before
+ * setting the prio_changed flag. In fact,
+ * this flag may be read asynchronously (in
+ * critical sections protected by a different
+ * lock than that held here), and finding this
+ * flag set may cause the execution of the code
+ * for updating parameters whose value may
+ * depend also on bfqg->entity.new_weight (in
+ * __bfq_entity_update_weight_prio).
+ * This barrier makes sure that the new value
+ * of bfqg->entity.new_weight is correctly
+ * seen in that code.
+ */
+ smp_wmb();
+ bfqg->entity.prio_changed = 1;
+ }
+ }
+ spin_unlock_irq(&blkcg->lock);
+
+ return ret;
+}
+
+/*
+ * cftype .write handler for the bfq.weight file.
+ *
+ * Parses the (whitespace-trimmed) content of @buf as an unsigned 64-bit
+ * integer and applies it through bfq_io_set_weight_legacy().
+ *
+ * Returns @nbytes on success, so that the whole write is reported as
+ * consumed; returning 0 here would make the write appear to have
+ * consumed nothing. On failure, returns the negative error code coming
+ * from the parser or from bfq_io_set_weight_legacy().
+ */
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes,
+				 loff_t off)
+{
+	u64 weight;
+	/* First unsigned integer found in the buffer is used */
+	int ret = kstrtoull(strim(buf), 0, &weight);
+
+	if (ret)
+		return ret;
+
+	ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+
+	return ret ? ret : nbytes;
+}
+
+/*
+ * seq_show handler: runs blkg_prfill_stat, through blkcg_print_blkgs(),
+ * for the bfq policy on the blkcg owning @sf, passing the private field
+ * of the cftype as data. Always returns 0.
+ */
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, false);
+ return 0;
+}
+
+/*
+ * seq_show handler: same as bfqg_print_stat(), but uses
+ * blkg_prfill_rwstat as fill function and passes true as last argument
+ * of blkcg_print_blkgs(). Always returns 0.
+ */
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, true);
+ return 0;
+}
+
+/*
+ * Fill function: prints, via __blkg_prfill_u64(), the value returned by
+ * blkg_stat_recursive_sum() for the blkg of @pd, the bfq policy and the
+ * offset @off. Returns what __blkg_prfill_u64() returns.
+ */
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_bfq, off);
+ return __blkg_prfill_u64(sf, pd, sum);
+}
+
+/*
+ * Fill function: prints, via __blkg_prfill_rwstat(), the blkg_rwstat
+ * returned by blkg_rwstat_recursive_sum() for the blkg of @pd, the bfq
+ * policy and the offset @off. Returns what __blkg_prfill_rwstat()
+ * returns.
+ */
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+ &blkcg_policy_bfq,
+ off);
+ return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+/*
+ * seq_show handler: runs bfqg_prfill_stat_recursive, through
+ * blkcg_print_blkgs(), for the bfq policy on the blkcg owning @sf,
+ * passing the private field of the cftype as data. Always returns 0.
+ */
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, false);
+ return 0;
+}
+
+/*
+ * seq_show handler: runs bfqg_prfill_rwstat_recursive, through
+ * blkcg_print_blkgs(), for the bfq policy on the blkcg owning @sf,
+ * passing the private field of the cftype as data. Always returns 0.
+ */
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
+/*
+ * Fill function: takes the total of the stat_bytes rwstat of the blkg
+ * of @pd, shifts it right by 9 bits (i.e., divides it by 512) and
+ * prints the result via __blkg_prfill_u64(). @off is not used.
+ */
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
+{
+	u64 bytes = blkg_rwstat_total(&pd->blkg->stat_bytes);
+	u64 sectors = bytes >> 9;
+
+	return __blkg_prfill_u64(sf, pd, sectors);
+}
+
+/*
+ * seq_show handler: runs bfqg_prfill_sectors, through
+ * blkcg_print_blkgs(), for the bfq policy on the blkcg owning @sf, with
+ * 0 as data. Always returns 0.
+ */
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
+ return 0;
+}
+
+/*
+ * Fill function: obtains the recursive sum of the stat_bytes rwstat of
+ * the blkg of @pd (NULL is passed as policy, and the offset of
+ * stat_bytes within struct blkcg_gq as offset), adds up its READ and
+ * WRITE aux counters, shifts the sum right by 9 bits (i.e., divides it
+ * by 512) and prints the result via __blkg_prfill_u64(). @off is not
+ * used.
+ */
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes));
+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+/*
+ * seq_show handler: runs bfqg_prfill_sectors_recursive, through
+ * blkcg_print_blkgs(), for the bfq policy on the blkcg owning @sf, with
+ * 0 as data. Always returns 0.
+ */
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
+ false);
+ return 0;
+}
+
+/*
+ * Fill function: prints avg_queue_size_sum divided by
+ * avg_queue_size_samples for the group of @pd, or 0 if no sample has
+ * been collected yet. @off is not used. Always returns 0.
+ */
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	u64 nr_samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 avg = 0;
+
+	/* Guard against a division by zero when there are no samples. */
+	if (nr_samples)
+		avg = div64_u64(blkg_stat_read(&bfqg->stats.avg_queue_size_sum),
+				nr_samples);
+
+	__blkg_prfill_u64(sf, pd, avg);
+
+	return 0;
+}
+
+/*
+ * print avg_queue_size: seq_show handler that runs
+ * bfqg_prfill_avg_queue_size, through blkcg_print_blkgs(), for the bfq
+ * policy on the blkcg owning @sf, with 0 as data. Always returns 0.
+ */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+ 0, false);
+ return 0;
+}
+
+/*
+ * Activate the bfq blkcg policy on the request queue of @bfqd.
+ *
+ * Returns the bfq_group of the root blkg of that queue, or NULL if the
+ * policy activation fails. @node is not used in this variant.
+ */
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	if (blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq))
+		return NULL;
+
+	return blkg_to_bfqg(bfqd->queue->root_blkg);
+}
+
+/*
+ * blkcg policy descriptor for BFQ: the cgroup files exported by the
+ * policy, plus the callbacks for per-blkcg data (cpd_*) and per-blkg
+ * data (pd_*). Note that bfq_cpd_init is used both as cpd_init_fn and
+ * as cpd_bind_fn.
+ */
+struct blkcg_policy blkcg_policy_bfq = {
+ .dfl_cftypes = bfq_blkg_files,
+ .legacy_cftypes = bfq_blkcg_legacy_files,
+
+ .cpd_alloc_fn = bfq_cpd_alloc,
+ .cpd_init_fn = bfq_cpd_init,
+ .cpd_bind_fn = bfq_cpd_init,
+ .cpd_free_fn = bfq_cpd_free,
+
+ .pd_alloc_fn = bfq_pd_alloc,
+ .pd_init_fn = bfq_pd_init,
+ .pd_offline_fn = bfq_pd_offline,
+ .pd_free_fn = bfq_pd_free,
+ .pd_reset_stats_fn = bfq_pd_reset_stats,
+};
+
+struct cftype bfq_blkcg_legacy_files[] = {
+ {
+ .name = "bfq.weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = bfq_io_show_weight,
+ .write_u64 = bfq_io_set_weight_legacy,
+ },
+
+ /* statistics, covers only the tasks in the bfqg */
+ {
+ .name = "bfq.time",
+ .private = offsetof(struct bfq_group, stats.time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.sectors",
+ .seq_show = bfqg_print_stat_sectors,
+ },
+ {
+ .name = "bfq.io_service_bytes",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_bytes,
+ },
+ {
+ .name = "bfq.io_serviced",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_ios,
+ },
+ {
+ .name = "bfq.io_service_time",
+ .private = offsetof(struct bfq_group, stats.service_time),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_wait_time",
+ .private = offsetof(struct bfq_group, stats.wait_time),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_merged",
+ .private = offsetof(struct bfq_group, stats.merged),
+ .seq_show = bfqg_print_rwstat,
+ },
+ {
+ .name = "bfq.io_queued",
+ .private = offsetof(struct bfq_group, stats.queued),
+ .seq_show = bfqg_print_rwstat,
+ },
+
+ /* the same statistics which cover the bfqg and its descendants */
+ {
+ .name = "bfq.time_recursive",
+ .private = offsetof(struct bfq_group, stats.time),
+ .seq_show = bfqg_print_stat_recursive,
+ },
+ {
+ .name = "bfq.sectors_recursive",
+ .seq_show = bfqg_print_stat_sectors_recursive,
+ },
+ {
+ .name = "bfq.io_service_bytes_recursive",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_bytes_recursive,
+ },
+ {
+ .name = "bfq.io_serviced_recursive",
+ .private = (unsigned long)&blkcg_policy_bfq,
+ .seq_show = blkg_print_stat_ios_recursive,
+ },
+ {
+ .name = "bfq.io_service_time_recursive",
+ .private = offsetof(struct bfq_group, stats.service_time),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_wait_time_recursive",
+ .private = offsetof(struct bfq_group, stats.wait_time),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_merged_recursive",
+ .private = offsetof(struct bfq_group, stats.merged),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.io_queued_recursive",
+ .private = offsetof(struct bfq_group, stats.queued),
+ .seq_show = bfqg_print_rwstat_recursive,
+ },
+ {
+ .name = "bfq.avg_queue_size",
+ .seq_show = bfqg_print_avg_queue_size,
+ },
+ {
+ .name = "bfq.group_wait_time",
+ .private = offsetof(struct bfq_group, stats.group_wait_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.idle_time",
+ .private = offsetof(struct bfq_group, stats.idle_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.empty_time",
+ .private = offsetof(struct bfq_group, stats.empty_time),
+ .seq_show = bfqg_print_stat,
+ },
+ {
+ .name = "bfq.dequeue",
+ .private = offsetof(struct bfq_group, stats.dequeue),
+ .seq_show = bfqg_print_stat,
+ },
+ { } /* terminate */
+};
+
+/*
+ * Files installed as dfl_cftypes of blkcg_policy_bfq: only bfq.weight,
+ * not created on the root cgroup (CFTYPE_NOT_ON_ROOT), written through
+ * the buffer-based bfq_io_set_weight() handler.
+ */
+struct cftype bfq_blkg_files[] = {
+ {
+ .name = "bfq.weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = bfq_io_show_weight,
+ .write = bfq_io_set_weight,
+ },
+ {} /* terminate */
+};
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Empty versions of the group-statistics hooks and of bfq_bfqq_move(),
+ * compiled when CONFIG_BFQ_GROUP_IOSCHED is not set: every function
+ * below has an empty body.
+ */
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+ unsigned int op) { }
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+ uint64_t io_start_time, unsigned int op) { }
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg) {}
+
+/*
+ * Initialize @entity as a member of @bfqg: weight and orig_weight are
+ * both set to new_weight, and sched_data is pointed at the sched_data
+ * of @bfqg. If @entity belongs to a queue, the ioprio and ioprio_class
+ * of the queue are also set to their new_* counterparts.
+ */
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
+{
+	struct bfq_queue *owner = bfq_entity_to_bfqq(entity);
+
+	entity->sched_data = &bfqg->sched_data;
+	entity->orig_weight = entity->new_weight;
+	entity->weight = entity->new_weight;
+
+	if (!owner)
+		return;
+
+	owner->ioprio_class = owner->new_ioprio_class;
+	owner->ioprio = owner->new_ioprio;
+}
+
+/* Empty version, compiled when CONFIG_BFQ_GROUP_IOSCHED is not set. */
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {}
+
+/*
+ * Version compiled when CONFIG_BFQ_GROUP_IOSCHED is not set: calls
+ * bfq_end_wr_async_queues() for the root group only.
+ */
+void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+ bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+/*
+ * Version compiled when CONFIG_BFQ_GROUP_IOSCHED is not set: always
+ * returns the root group of @bfqd; @blkcg is ignored.
+ */
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg)
+{
+ return bfqd->root_group;
+}
+
+/*
+ * Version compiled when CONFIG_BFQ_GROUP_IOSCHED is not set: always
+ * returns the root group of the bfq_data that @bfqq points to.
+ */
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+ return bfqq->bfqd->root_group;
+}
+
+/*
+ * Version compiled when CONFIG_BFQ_GROUP_IOSCHED is not set: allocates
+ * a single, zeroed bfq_group on NUMA node @node and initializes each of
+ * its BFQ_IOPRIO_CLASSES service trees with BFQ_SERVICE_TREE_INIT.
+ *
+ * Returns the new group, or NULL if the allocation fails. @bfqd is not
+ * used.
+ */
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	struct bfq_group *root;
+	int class;
+
+	root = kzalloc_node(sizeof(*root), GFP_KERNEL, node);
+	if (!root)
+		return NULL;
+
+	for (class = 0; class < BFQ_IOPRIO_CLASSES; class++)
+		root->sched_data.service_tree[class] = BFQ_SERVICE_TREE_INIT;
+
+	return root;
+}
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 0000000000000..bd8499ef157ce
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,5047 @@
+/*
+ * Budget Fair Queueing (BFQ) I/O scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ * Arianna Avanzini <avanzini@google.com>
+ *
+ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BFQ is a proportional-share I/O scheduler, with some extra
+ * low-latency capabilities. BFQ also supports full hierarchical
+ * scheduling through cgroups. Next paragraphs provide an introduction
+ * on BFQ inner workings. Details on BFQ benefits, usage and
+ * limitations can be found in Documentation/block/bfq-iosched.txt.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based
+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns
+ * budgets, measured in number of sectors, to processes instead of
+ * time slices. The device is not granted to the in-service process
+ * for a given time slice, but until it has exhausted its assigned
+ * budget. This change from the time to the service domain enables BFQ
+ * to distribute the device throughput among processes as desired,
+ * without any distortion due to throughput fluctuations, or to device
+ * internal queueing. BFQ uses an ad hoc internal scheduler, called
+ * B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated with processes. Each
+ * process/queue is assigned a user-configurable weight, and B-WF2Q+
+ * guarantees that each queue receives a fraction of the throughput
+ * proportional to its weight. Thanks to the accurate policy of
+ * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound
+ * processes issuing sequential requests (to boost the throughput),
+ * and yet guarantee a low latency to interactive and soft real-time
+ * applications.
+ *
+ * In particular, to provide these low-latency guarantees, BFQ
+ * explicitly privileges the I/O of two classes of time-sensitive
+ * applications: interactive and soft real-time. This feature enables
+ * BFQ to provide applications in these classes with a very low
+ * latency. Finally, BFQ also features additional heuristics for
+ * preserving both a low latency and a high throughput on NCQ-capable,
+ * rotational or flash-based devices, and to get the job done quickly
+ * for applications consisting in many I/O-bound processes.
+ *
+ * BFQ is described in [1], where also a reference to the initial, more
+ * theoretical paper on BFQ can be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, which is described in [2], together with
+ * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+
+ * with O(log N) complexity derives from the one introduced with EEVDF
+ * in [3].
+ *
+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+ * Scheduler", Proceedings of the First Workshop on Mobile System
+ * Technologies (MST-2015), May 2015.
+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing
+ * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689,
+ * Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline
+ * First: A Flexible and Accurate Mechanism for Proportional Share
+ * Resource Allocation", technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/ktime.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+#include "bfq-iosched.h"
+
+/*
+ * For the queue flag BFQQF_<name>, define three helpers:
+ *   bfq_mark_bfqq_<name>()  - set the flag in bfqq->flags;
+ *   bfq_clear_bfqq_<name>() - clear the flag in bfqq->flags;
+ *   bfq_bfqq_<name>()       - return the result of testing the flag.
+ * The macro is only needed for the instantiations right below, and is
+ * undefined after them.
+ */
+#define BFQ_BFQQ_FNS(name) \
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \
+{ \
+	__set_bit(BFQQF_##name, &(bfqq)->flags); \
+} \
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \
+{ \
+	__clear_bit(BFQQF_##name, &(bfqq)->flags); \
+} \
+int bfq_bfqq_##name(const struct bfq_queue *bfqq) \
+{ \
+	return test_bit(BFQQF_##name, &(bfqq)->flags); \
+}
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+/* No trailing backslash here: it would splice the next line into the directive. */
+#undef BFQ_BFQQ_FNS
+
+/*
+ * Expiration time of sync (0) and async (1) requests, in ns.
+ * The two values amount to 250 ms and 125 ms, respectively.
+ */
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
+
+/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
+static const int bfq_back_max = 16 * 1024;
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in ns (the value below amounts to 8 ms). */
+static u64 bfq_slice_idle = NSEC_PER_SEC / 125;
+
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = 16 * 1024;
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
+ */
+static const int bfq_async_charge_factor = 10;
+
+/*
+ * Default timeout values, in jiffies, approximating CFQ defaults.
+ * HZ / 8 jiffies correspond to 125 ms.
+ */
+const int bfq_timeout = HZ / 8;
+
+/* Slab cache pointer; NOTE(review): presumably for bfq_queue objects -- confirm. */
+static struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ns), we consider thinktime immediate. */
+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC)
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD 4
+#define BFQ_HW_QUEUE_SAMPLES 32
+
+/*
+ * Seek-related thresholds, all cast to sector_t.
+ * NOTE(review): presumably expressed in sectors -- confirm against the
+ * code using them.
+ */
+#define BFQQ_SEEK_THR (sector_t)(8 * 100)
+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024)
+/* True if more than 32/8 = 4 bits are set in the seek_history of bfqq. */
+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8)
+
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES 32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT 16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ * SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
+
+/*
+ * Accessors for the two elevator-private pointers of a request:
+ * elv.priv[0], cast to a bfq_io_cq pointer, and elv.priv[1], left
+ * uncast (NOTE(review): presumably the bfq_queue of the request --
+ * confirm against the users of RQ_BFQQ).
+ */
+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
+
+/*
+ * Return the queue stored in @bic at index @is_sync, i.e., bfqq[1] if
+ * @is_sync is true and bfqq[0] otherwise.
+ */
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+ return bic->bfqq[is_sync];
+}
+
+/*
+ * Store @bfqq in @bic at index @is_sync, i.e., in bfqq[1] if @is_sync
+ * is true and in bfqq[0] otherwise. @bfqq may be NULL.
+ */
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
+{
+ bic->bfqq[is_sync] = bfqq;
+}
+
+/*
+ * Return the elevator_data of the elevator attached to the request
+ * queue that the icq embedded in @bic refers to.
+ */
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+{
+ return bic->icq.q->elevator->elevator_data;
+}
+
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue.
+ *
+ * Returns the bfq_io_cq embedding @icq, computed with container_of().
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+{
+ /* bic->icq is the first member, %NULL will convert to %NULL */
+ return container_of(icq, struct bfq_io_cq, icq);
+}
+
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key.
+ * @ioc: the io_context of the process doing I/O.
+ * @q: the request queue.
+ *
+ * The lookup is performed with q->queue_lock held and interrupts
+ * saved/restored. Returns NULL right away if @ioc is NULL; otherwise
+ * returns the conversion, through icq_to_bic(), of what
+ * ioc_lookup_icq() finds.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+					struct io_context *ioc,
+					struct request_queue *q)
+{
+	struct bfq_io_cq *bic;
+	unsigned long flags;
+
+	if (!ioc)
+		return NULL;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	bic = icq_to_bic(ioc_lookup_icq(ioc, q));
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return bic;
+}
+
+/*
+ * Schedule a run of the queue, if there are requests pending and no one
+ * in the driver that will restart queueing. Nothing is done when
+ * bfqd->queued is zero.
+ */
+void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+	if (bfqd->queued == 0)
+		return;
+
+	bfq_log(bfqd, "schedule dispatch");
+	blk_mq_run_hw_queues(bfqd->queue, true);
+}
+
+/* True if the ioprio class of the queue is IOPRIO_CLASS_IDLE / IOPRIO_CLASS_RT. */
+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+/* True if the number of samples exceeds 80. */
+#define bfq_sample_valid(samples) ((samples) > 80)
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
+ * We choose the request that is closest to the head right now. Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ *
+ * @last is the sector taken as current head position. The decision is
+ * made in the following order:
+ *  - if @rq1 is NULL or equal to @rq2, @rq2 is returned; if @rq2 is
+ *    NULL, @rq1 is returned;
+ *  - a sync request wins over a non-sync one;
+ *  - a request with REQ_META set wins over one without it;
+ *  - otherwise, the choice depends on the distances of the two requests
+ *    from @last, as detailed in the body.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+ struct request *rq1,
+ struct request *rq2,
+ sector_t last)
+{
+ sector_t s1, s2, d1 = 0, d2 = 0;
+ unsigned long back_max;
+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */
+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
+
+ if (!rq1 || rq1 == rq2)
+ return rq2;
+ if (!rq2)
+ return rq1;
+
+ if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+ return rq1;
+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+ return rq2;
+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+ return rq1;
+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+ return rq2;
+
+ s1 = blk_rq_pos(rq1);
+ s2 = blk_rq_pos(rq2);
+
+ /*
+ * By definition, 1KiB is 2 sectors.
+ */
+ back_max = bfqd->bfq_back_max * 2;
+
+ /*
+ * Strict one way elevator _except_ in the case where we allow
+ * short backward seeks which are biased as twice the cost of a
+ * similar forward seek.
+ * A request more than back_max sectors behind @last is marked as
+ * wrapped, and its distance is left at 0.
+ */
+ if (s1 >= last)
+ d1 = s1 - last;
+ else if (s1 + back_max >= last)
+ d1 = (last - s1) * bfqd->bfq_back_penalty;
+ else
+ wrap |= BFQ_RQ1_WRAP;
+
+ if (s2 >= last)
+ d2 = s2 - last;
+ else if (s2 + back_max >= last)
+ d2 = (last - s2) * bfqd->bfq_back_penalty;
+ else
+ wrap |= BFQ_RQ2_WRAP;
+
+ /* Found required data */
+
+ /*
+ * By doing switch() on the bit mask "wrap" we avoid having to
+ * check two variables for all permutations: --> faster!
+ */
+ switch (wrap) {
+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
+ if (d1 < d2)
+ return rq1;
+ else if (d2 < d1)
+ return rq2;
+
+ /* Same distance: pick the request with the higher sector. */
+ if (s1 >= s2)
+ return rq1;
+ else
+ return rq2;
+
+ case BFQ_RQ2_WRAP:
+ return rq1;
+ case BFQ_RQ1_WRAP:
+ return rq2;
+ case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */
+ default:
+ /*
+ * Since both rqs are wrapped,
+ * start with the one that's further behind head
+ * (--> only *one* back seek required),
+ * since back seek takes more time than forward.
+ */
+ if (s1 <= s2)
+ return rq1;
+ else
+ return rq2;
+ }
+}
+
+/*
+ * Search @root for a queue whose next_rq starts exactly at @sector.
+ *
+ * Returns the matching queue, or NULL if there is none. In both cases
+ * *@ret_parent is set to the last node visited (NULL for an empty
+ * tree) and, if @rb_link is not NULL, *@rb_link is set to the link
+ * slot where the descent stopped: on a miss this is the empty slot
+ * under *@ret_parent, as needed by rb_link_node() to insert a new node
+ * for @sector.
+ */
+static struct bfq_queue *
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
+ sector_t sector, struct rb_node **ret_parent,
+ struct rb_node ***rb_link)
+{
+ struct rb_node **p, *parent;
+ struct bfq_queue *bfqq = NULL;
+
+ parent = NULL;
+ p = &root->rb_node;
+ while (*p) {
+ struct rb_node **n;
+
+ parent = *p;
+ bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+
+ /*
+ * Sort strictly based on sector. Smallest to the left,
+ * largest to the right.
+ */
+ if (sector > blk_rq_pos(bfqq->next_rq))
+ n = &(*p)->rb_right;
+ else if (sector < blk_rq_pos(bfqq->next_rq))
+ n = &(*p)->rb_left;
+ else
+ break;
+ p = n;
+ /* No match at this node: forget it before descending. */
+ bfqq = NULL;
+ }
+
+ *ret_parent = parent;
+ if (rb_link)
+ *rb_link = p;
+
+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
+ (unsigned long long)sector,
+ bfqq ? bfqq->pid : 0);
+
+ return bfqq;
+}
+
+/*
+ * (Re)position @bfqq in the rq_pos_tree of its group, according to the
+ * start sector of its next request.
+ *
+ * @bfqq is first removed from the position tree it currently sits in,
+ * if any. It is then left out of any tree if it belongs to the idle
+ * class, if it has no next request, or if the tree already contains a
+ * queue whose next request starts at the same sector.
+ */
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct rb_node **link, *parent;
+
+	if (bfqq->pos_root) {
+		rb_erase(&bfqq->pos_node, bfqq->pos_root);
+		bfqq->pos_root = NULL;
+	}
+
+	if (bfq_class_idle(bfqq) || !bfqq->next_rq)
+		return;
+
+	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+
+	/* The sector is already taken by another queue: stay out of the tree. */
+	if (bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
+				   blk_rq_pos(bfqq->next_rq),
+				   &parent, &link)) {
+		bfqq->pos_root = NULL;
+		return;
+	}
+
+	rb_link_node(&bfqq->pos_node, parent, link);
+	rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ *
+ * Returns true if queue_weights_tree contains at least two nodes (its
+ * root is non-empty and has a left or right child) or, only when
+ * CONFIG_BFQ_GROUP_IOSCHED is set, if the same holds for
+ * group_weights_tree.
+ *
+ * Note on the layout: the parentheses of the return expression are
+ * deliberately split across the #ifdef, so that the expression is
+ * balanced both with and without the group part.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+ /*
+ * For weights to differ, at least one of the trees must contain
+ * at least two nodes.
+ */
+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+ (bfqd->queue_weights_tree.rb_node->rb_left ||
+ bfqd->queue_weights_tree.rb_node->rb_right)
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ ) ||
+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+ (bfqd->group_weights_tree.rb_node->rb_left ||
+ bfqd->group_weights_tree.rb_node->rb_right)
+#endif
+ );
+}
+
+/*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ * weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ * number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+ return !bfq_differentiated_weights(bfqd);
+}
+
+/*
+ * Account for the weight of @entity in the weight-counter tree @root.
+ *
+ * If the tree already contains a counter for the weight of the entity,
+ * then that counter is associated with the entity and incremented;
+ * otherwise a new counter is allocated, inserted into the tree and
+ * incremented.
+ *
+ * @bfqd: device data (not referenced by this function).
+ * @entity: the entity whose weight must be accounted for.
+ * @root: root of the weight-counter tree, ordered by weight.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
+ */
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+			  struct rb_root *root)
+{
+	struct rb_node **new = &root->rb_node, *parent = NULL;
+
+	/*
+	 * Do not insert if the entity is already associated with a
+	 * counter, which happens if:
+	 *   1) the entity is associated with a queue,
+	 *   2) a request arrival has caused the queue to become both
+	 *      non-weight-raised, and hence change its weight, and
+	 *      backlogged; in this respect, each of the two events
+	 *      causes an invocation of this function,
+	 *   3) this is the invocation of this function caused by the
+	 *      second event. This second invocation is actually useless,
+	 *      and we handle this fact by exiting immediately. More
+	 *      efficient or clearer solutions might possibly be adopted.
+	 */
+	if (entity->weight_counter)
+		return;
+
+	/*
+	 * Walk down the tree looking for a counter with the same weight,
+	 * remembering the link where a new node would have to be attached.
+	 */
+	while (*new) {
+		struct bfq_weight_counter *counter =
+			container_of(*new, struct bfq_weight_counter,
+				     weights_node);
+
+		parent = *new;
+
+		if (entity->weight == counter->weight) {
+			entity->weight_counter = counter;
+			goto inc_counter;
+		}
+		if (entity->weight < counter->weight)
+			new = &(*new)->rb_left;
+		else
+			new = &(*new)->rb_right;
+	}
+
+	/* Size the allocation from the pointer, not from a repeated type name. */
+	entity->weight_counter = kzalloc(sizeof(*entity->weight_counter),
+					 GFP_ATOMIC);
+
+	/*
+	 * In the unlucky event of an allocation failure, we just
+	 * exit. This will cause the weight of entity to not be
+	 * considered in bfq_differentiated_weights, which, in its
+	 * turn, causes the scenario to be deemed wrongly symmetric in
+	 * case entity's weight would have been the only weight making
+	 * the scenario asymmetric. On the bright side, no imbalance
+	 * will however occur when entity becomes inactive again (the
+	 * invocation of this function is triggered by an activation
+	 * of entity). In fact, bfq_weights_tree_remove does nothing
+	 * if !entity->weight_counter.
+	 */
+	if (unlikely(!entity->weight_counter))
+		return;
+
+	entity->weight_counter->weight = entity->weight;
+	rb_link_node(&entity->weight_counter->weights_node, parent, new);
+	rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+	entity->weight_counter->num_active++;
+}
+
+/*
+ * Drop the reference that @entity holds on its weight counter: the
+ * counter is decremented and, once it is no longer positive, it is
+ * erased from the tree @root and freed. In any case the entity is
+ * detached from the counter. Nothing is done if the entity has no
+ * counter. See the comments to the function bfq_weights_tree_add()
+ * for considerations about overhead.
+ */
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+			     struct rb_root *root)
+{
+	struct bfq_weight_counter *counter = entity->weight_counter;
+
+	if (!counter)
+		return;
+
+	/* The entity stops referring to the counter whatever happens next. */
+	entity->weight_counter = NULL;
+
+	counter->num_active--;
+	if (counter->num_active > 0)
+		return;
+
+	/* Last active entity with this weight: the counter can go away. */
+	rb_erase(&counter->weights_node, root);
+	kfree(counter);
+}
+
+/*
+ * Return the request at the head of the fifo of bfqq if its fifo_time
+ * has been reached, or NULL to just start from scratch in the rbtree.
+ * The fifo is inspected only if the fifo_expire flag of bfqq is not
+ * set yet, and the flag is set as part of the inspection. The head
+ * request is never returned if it coincides with @last.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
+				      struct request *last)
+{
+	struct request *head;
+
+	if (bfq_bfqq_fifo_expire(bfqq))
+		return NULL;
+
+	bfq_mark_bfqq_fifo_expire(bfqq);
+
+	head = rq_entry_fifo(bfqq->fifo.next);
+
+	if (head != last && ktime_get_ns() >= head->fifo_time) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", head);
+		return head;
+	}
+
+	return NULL;
+}
+
+/*
+ * Pick the request to serve after @last. An expired fifo request, if
+ * any, wins; otherwise the choice is delegated to bfq_choose_req(),
+ * which is given the rbtree neighbours of @last (wrapping around to the
+ * first request of the sort list if @last has no successor).
+ */
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq,
+					struct request *last)
+{
+	struct rb_node *after = rb_next(&last->rb_node);
+	struct rb_node *before = rb_prev(&last->rb_node);
+	struct request *next, *prev;
+
+	/* Follow expired path, else get first next available. */
+	next = bfq_check_fifo(bfqq, last);
+	if (next)
+		return next;
+
+	prev = before ? rb_entry_rq(before) : NULL;
+
+	if (!after) {
+		/* No successor: wrap around, but never pick @last itself. */
+		after = rb_first(&bfqq->sort_list);
+		if (after == &last->rb_node)
+			after = NULL;
+	}
+	if (after)
+		next = rb_entry_rq(after);
+
+	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
+}
+
+/*
+ * Return the service to charge to bfqq for rq, in sectors; see the
+ * definition of bfq_async_charge_factor for details.
+ */
+static unsigned long bfq_serv_to_charge(struct request *rq,
+					struct bfq_queue *bfqq)
+{
+	/* Only async, non-weight-raised queues get their service amplified. */
+	if (!bfq_bfqq_sync(bfqq) && bfqq->wr_coeff <= 1) {
+		/*
+		 * With weight-raised queues around, amplify by twice
+		 * the async charge factor, to further reduce latency
+		 * for those queues; otherwise amplify by just the
+		 * async charge factor.
+		 */
+		if (bfqq->bfqd->wr_busy_queues != 0)
+			return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
+
+		return blk_rq_sectors(rq) * bfq_async_charge_factor;
+	}
+
+	return blk_rq_sectors(rq);
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * When the first request of a queue changes, make sure that the budget
+ * of the queue is enough to serve at least that request (the request
+ * may have grown). Otherwise the queue would need two dispatch rounds
+ * to actually get its first request dispatched. Nothing is done if the
+ * queue has no next request or is the in-service queue.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+				 struct bfq_queue *bfqq)
+{
+	struct request *first_rq = bfqq->next_rq;
+	unsigned long wanted;
+
+	/*
+	 * In order not to break guarantees, budgets cannot be changed
+	 * after an entity has been selected for service.
+	 */
+	if (!first_rq || bfqq == bfqd->in_service_queue)
+		return;
+
+	wanted = max_t(unsigned long, bfqq->max_budget,
+		       bfq_serv_to_charge(first_rq, bfqq));
+	if (bfqq->entity.budget == wanted)
+		return;
+
+	bfqq->entity.budget = wanted;
+	bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+		     wanted);
+	bfq_requeue_bfqq(bfqd, bfqq);
+}
+
+/*
+ * Restore into bfqq the state saved in bic: idle-window and IO-bound
+ * flags, think time and weight-raising parameters. Weight raising is
+ * switched off right away if bfqq is in a large burst or the restored
+ * weight-raising period has already elapsed.
+ */
+static void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+	if (!bic->saved_idle_window)
+		bfq_clear_bfqq_idle_window(bfqq);
+	else
+		bfq_mark_bfqq_idle_window(bfqq);
+
+	if (!bic->saved_IO_bound)
+		bfq_clear_bfqq_IO_bound(bfqq);
+	else
+		bfq_mark_bfqq_IO_bound(bfqq);
+
+	bfqq->ttime = bic->saved_ttime;
+
+	/* Weight-raising state, as it was when it got saved. */
+	bfqq->wr_coeff = bic->saved_wr_coeff;
+	bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+
+	if (bfqq->wr_coeff > 1) {
+		if (bfq_bfqq_in_large_burst(bfqq) ||
+		    time_is_before_jiffies(bfqq->last_wr_start_finish +
+					   bfqq->wr_cur_max_time)) {
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "resume state: switching off wr");
+
+			bfqq->wr_coeff = 1;
+		}
+	}
+
+	/* make sure weight will be updated, however we got here */
+	bfqq->entity.prio_changed = 1;
+}
+
+/*
+ * Return the reference count of bfqq minus its allocated count and
+ * minus the on_st field of its entity.
+ */
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+	int refs = bfqq->ref;
+
+	refs -= bfqq->allocated;
+	refs -= bfqq->entity.on_st;
+
+	return refs;
+}
+
+/*
+ * Start a new burst made of bfqq only: unlink every queue currently in
+ * the burst list, then record bfqq as its sole member and the parent of
+ * bfqq as the parent entity of the burst (see comments on
+ * bfq_handle_burst).
+ */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *cur;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry_safe(cur, tmp, &bfqd->burst_list, burst_list_node)
+		hlist_del_init(&cur->burst_list_node);
+
+	bfqd->burst_size = 1;
+	bfqd->burst_parent_entity = bfqq->entity.parent;
+	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * Count bfqq in the current burst (see bfq_handle_burst). If the burst
+ * thereby reaches the large-burst threshold, flag the burst and all its
+ * queues as large and dismantle the burst list; otherwise just link
+ * bfqq into the burst list.
+ */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *item;
+	struct hlist_node *tmp;
+
+	/* Increment burst size to take into account also bfqq */
+	bfqd->burst_size++;
+
+	if (bfqd->burst_size != bfqd->bfq_large_burst_thresh) {
+		/*
+		 * Burst not yet large: add bfqq to the burst list. Do
+		 * not increment the ref counter for bfqq, because bfqq
+		 * is removed from the burst list before freeing bfqq
+		 * in put_queue.
+		 */
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+		return;
+	}
+
+	/*
+	 * Enough queues have been activated shortly after each other
+	 * to consider this burst as large.
+	 */
+	bfqd->large_burst = true;
+
+	/* Every queue collected so far, and bfqq too, is in a large burst. */
+	hlist_for_each_entry(item, &bfqd->burst_list, burst_list_node)
+		bfq_mark_bfqq_in_large_burst(item);
+	bfq_mark_bfqq_in_large_burst(bfqq);
+
+	/*
+	 * From now on, and until the current burst finishes, any new
+	 * queue being activated shortly after the last queue was
+	 * inserted in the burst can be immediately marked as belonging
+	 * to a large burst. So the burst list is not needed any more.
+	 * Remove it.
+	 */
+	hlist_for_each_entry_safe(item, tmp, &bfqd->burst_list,
+				  burst_list_node)
+		hlist_del_init(&item->burst_list_node);
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we simply call these bursts 'large'. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ * list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ * not yet belong to the burst is activated shortly after the last time
+ * at which a new queue entered the burst list, then the function appends
+ * Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ * the large-burst threshold, then
+ *
+ * . all the queues in the burst list are marked as belonging to a
+ * large burst
+ *
+ * . the burst list is deleted; in fact, the burst list already served
+ * its purpose (keeping temporarily track of the queues in a burst,
+ * so as to be able to mark them as belonging to a large burst in the
+ * previous sub-step), and now is not needed any more
+ *
+ * . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ * the device is in large-burst mode and shortly after the last time
+ * at which a queue either entered the burst list or was marked as
+ * belonging to the current large burst, then Q is immediately marked
+ * as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ * after, i.e., not shortly after, the last time at which a queue
+ * either entered the burst list or was marked as belonging to the
+ * current large burst, then the current burst is deemed finished and:
+ *
+ * . the large-burst mode is reset if set
+ *
+ * . the burst list is emptied
+ *
+ * . Q is inserted in the burst list, as Q may be the first queue
+ * in a possible new burst (then the burst list contains just Q
+ * after this step).
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/*
+	 * There is nothing to do if bfqq already sits in the burst
+	 * list, is already flagged as part of a large burst, or has
+	 * just been split.
+	 */
+	if (!hlist_unhashed(&bfqq->burst_list_node) ||
+	    bfq_bfqq_in_large_burst(bfqq) ||
+	    time_is_after_eq_jiffies(bfqq->split_time +
+				     msecs_to_jiffies(10)))
+		return;
+
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+				   bfqd->bfq_burst_interval) ||
+	    bfqq->entity.parent != bfqd->burst_parent_entity) {
+		/*
+		 * bfqq's creation happens late enough, or bfqq belongs
+		 * to a different group than the burst group: the
+		 * current burst is finished, and the related data
+		 * structures must be reset.
+		 *
+		 * In this respect, consider the special case where
+		 * bfqq is the very first queue created after BFQ is
+		 * selected for this device. In this case,
+		 * last_ins_in_burst and burst_parent_entity are not
+		 * yet significant when we get here. But whichever
+		 * branch is taken, bfqq ends up being the only queue
+		 * in the burst list, which is exactly what has to
+		 * happen, as bfqq may be the first queue of the first
+		 * burst.
+		 */
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+	} else if (bfqd->large_burst) {
+		/*
+		 * bfqq is being activated shortly after the last queue
+		 * and the current burst is already large: bfqq can be
+		 * marked as belonging to this large burst immediately.
+		 */
+		bfq_mark_bfqq_in_large_burst(bfqq);
+	} else {
+		/*
+		 * bfqq is being activated shortly after the last
+		 * queue, but a large-burst state has not been reached
+		 * yet: add bfqq to the burst.
+		 */
+		bfq_add_to_burst(bfqd, bfqq);
+	}
+
+	/*
+	 * At this point, bfqq either has been added to the current
+	 * burst or has caused the current burst to terminate and a
+	 * possible new burst to start, of which bfqq is the first
+	 * queue. In both cases last_ins_in_burst needs to be moved
+	 * forward.
+	 */
+	bfqd->last_ins_in_burst = jiffies;
+}
+
+/* Return the part of the budget of bfqq not yet consumed by its service. */
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+	return bfqq->entity.budget - bfqq->entity.service;
+}
+
+/*
+ * Return the max budget to use: the one stored in bfqd, which is
+ * dynamically updated according to the estimated disk peak rate, once
+ * enough budgets have been assigned (i.e., enough samples have been
+ * computed); the default max budget until then.
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned >= bfq_stats_min_budgets)
+		return bfqd->bfq_max_budget;
+
+	return bfq_default_max_budget;
+}
+
+/*
+ * Return the min budget, namely 1/32 of the max budget stored in bfqd
+ * once enough budgets have been assigned, and 1/32 of the default max
+ * budget until then.
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned >= bfq_stats_min_budgets)
+		return bfqd->bfq_max_budget / 32;
+
+	return bfq_default_max_budget / 32;
+}
+
+/*
+ * The next function, invoked after the input queue bfqq switches from
+ * idle to busy, updates the budget of bfqq. The function also tells
+ * whether the in-service queue should be expired, by returning
+ * true. The purpose of expiring the in-service queue is to give bfqq
+ * the chance to possibly preempt the in-service queue, and the reason
+ * for preempting the in-service queue is to achieve one of the two
+ * goals below.
+ *
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
+ * expired because it has remained idle. In particular, bfqq may have
+ * expired for one of the following two reasons:
+ *
+ * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling
+ * and did not manage to issue a new request before its last
+ * request was served;
+ *
+ * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue
+ * a new request before the expiration of the idling-time.
+ *
+ * Even if bfqq has expired for one of the above reasons, the process
+ * associated with the queue may be however issuing requests greedily,
+ * and thus be sensitive to the bandwidth it receives (bfqq may have
+ * remained idle for other reasons: CPU high load, bfqq not enjoying
+ * idling, I/O throttling somewhere in the path from the process to
+ * the I/O scheduler, ...). But if, after every expiration for one of
+ * the above two reasons, bfqq has to wait for the service of at least
+ * one full budget of another queue before being served again, then
+ * bfqq is likely to get a much lower bandwidth or resource time than
+ * its reserved ones. To address this issue, two countermeasures need
+ * to be taken.
+ *
+ * First, the budget and the timestamps of bfqq need to be updated in
+ * a special way on bfqq reactivation: they need to be updated as if
+ * bfqq did not remain idle and did not expire. In fact, if they are
+ * computed as if bfqq expired and remained idle until reactivation,
+ * then the process associated with bfqq is treated as if, instead of
+ * being greedy, it stopped issuing requests when bfqq remained idle,
+ * and restarts issuing requests only on this reactivation. In other
+ * words, the scheduler does not help the process recover the "service
+ * hole" between bfqq expiration and reactivation. As a consequence,
+ * the process receives a lower bandwidth than its reserved one. In
+ * contrast, to recover this hole, the budget must be updated as if
+ * bfqq was not expired at all before this reactivation, i.e., it must
+ * be set to the value of the remaining budget when bfqq was
+ * expired. Along the same line, timestamps need to be assigned the
+ * value they had the last time bfqq was selected for service, i.e.,
+ * before last expiration. Thus timestamps need to be back-shifted
+ * with respect to their normal computation (see [1] for more details
+ * on this tricky aspect).
+ *
+ * Secondly, to allow the process to recover the hole, the in-service
+ * queue must be expired too, to give bfqq the chance to preempt it
+ * immediately. In fact, if bfqq has to wait for a full budget of the
+ * in-service queue to be completed, then it may become impossible to
+ * let the process recover the hole, even if the back-shifted
+ * timestamps of bfqq are lower than those of the in-service queue. If
+ * this happens for most or all of the holes, then the process may not
+ * receive its reserved bandwidth. In this respect, it is worth noting
+ * that, since the service of outstanding requests is unpreemptible, a
+ * small fraction of the holes may still be unrecoverable, thereby
+ * causing a small loss of bandwidth.
+ *
+ * The last important point is detecting whether bfqq does need this
+ * bandwidth recovery. In this respect, the next function deems the
+ * process associated with bfqq greedy, and thus allows it to recover
+ * the hole, if: 1) the process is waiting for the arrival of a new
+ * request (which implies that bfqq expired for one of the above two
+ * reasons), and 2) such a request has arrived soon. The first
+ * condition is controlled through the flag non_blocking_wait_rq,
+ * while the second through the flag arrived_in_time. If both
+ * conditions hold, then the function computes the budget in the
+ * above-described special way, and signals that the in-service queue
+ * should be expired. Timestamp back-shifting is done later in
+ * __bfq_activate_entity.
+ *
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
+ * the process associated with bfqq recover a service hole, bfqq may
+ * however happen to have, after being (re)activated, a lower finish
+ * timestamp than the in-service queue. That is, the next budget of
+ * bfqq may have to be completed before the one of the in-service
+ * queue. If this is the case, then preempting the in-service queue
+ * allows this goal to be achieved, apart from the unpreemptible,
+ * outstanding requests mentioned above.
+ *
+ * Unfortunately, regardless of which of the above two goals one wants
+ * to achieve, service trees need first to be updated to know whether
+ * the in-service queue must be preempted. To have service trees
+ * correctly updated, the in-service queue must be expired and
+ * rescheduled, and bfqq must be scheduled too. This is one of the
+ * most costly operations (in future versions, the scheduling
+ * mechanism may be re-designed in such a way to make it possible to
+ * know whether preemption is needed without needing to update service
+ * trees). In addition, queue preemptions almost always cause random
+ * I/O, and thus loss of throughput. Because of these facts, the next
+ * function adopts the following simple scheme to avoid both costly
+ * operations and too frequent preemptions: it requests the expiration
+ * of the in-service queue (unconditionally) only for queues that need
+ * to recover a hole, or that either are weight-raised or deserve to
+ * be weight-raised.
+ */
+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
+						struct bfq_queue *bfqq,
+						bool arrived_in_time,
+						bool wr_or_deserves_wr)
+{
+	if (!bfq_bfqq_non_blocking_wait_rq(bfqq) || !arrived_in_time) {
+		/*
+		 * No service hole to recover: assign a regular budget,
+		 * large enough for the next request, and let the
+		 * weight-raising state alone decide about preemption.
+		 */
+		bfqq->entity.budget =
+			max_t(unsigned long, bfqq->max_budget,
+			      bfq_serv_to_charge(bfqq->next_rq, bfqq));
+		bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+		return wr_or_deserves_wr;
+	}
+
+	/*
+	 * The flag non_blocking_wait_rq is deliberately left set here:
+	 * it is used in bfq_activate_bfqq to signal that timestamps
+	 * need to be back-shifted (and is cleared right after).
+	 *
+	 * The assignment below relies on the fact that neither
+	 * entity->service nor entity->budget is updated on expiration
+	 * if bfqq is empty (see __bfq_bfqq_recalc_budget). Both
+	 * quantities are thus unchanged after such an expiration, and
+	 * the budget assigned here is the budget that was left on that
+	 * expiration. For clarity, entity->service is not updated on
+	 * expiration in any case, and, in normal operation, is reset
+	 * only when bfqq is selected for service (see
+	 * bfq_get_next_queue).
+	 */
+	bfqq->entity.budget = min_t(unsigned long,
+				    bfq_bfqq_budget_left(bfqq),
+				    bfqq->max_budget);
+
+	return true;
+}
+
+/*
+ * Return the duration, in jiffies, to use for a weight-raising period.
+ *
+ * If bfqd->bfq_wr_max_time is positive, it is returned as is. Otherwise
+ * the duration is computed as bfqd->RT_prod divided by the current
+ * bfqd->peak_rate, and then limited to the range [3, 13] seconds.
+ */
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+	const unsigned long min_dur = msecs_to_jiffies(3000);
+	const unsigned long max_dur = msecs_to_jiffies(13000);
+	u64 dur;
+
+	if (bfqd->bfq_wr_max_time > 0)
+		return bfqd->bfq_wr_max_time;
+
+	/*
+	 * Never divide by zero. The lower the rate, the longer the
+	 * computed duration, so a zero rate maps to the upper limit.
+	 * NOTE(review): the rate estimator presumably keeps peak_rate
+	 * positive in normal operation; confirm, and keep this guard
+	 * as a safety net in any case.
+	 */
+	if (unlikely(!bfqd->peak_rate))
+		return max_dur;
+
+	dur = bfqd->RT_prod;
+	do_div(dur, bfqd->peak_rate);
+
+	/*
+	 * Limit duration between 3 and 13 seconds. Tests show that
+	 * higher values than 13 seconds often yield the opposite of
+	 * the desired result, i.e., worsen responsiveness by letting
+	 * non-interactive and non-soft-real-time applications
+	 * preserve weight raising for a too long time interval.
+	 *
+	 * On the other end, lower values than 3 seconds make it
+	 * difficult for most interactive tasks to complete their jobs
+	 * before weight-raising finishes.
+	 */
+	if (dur > max_dur)
+		dur = max_dur;
+	else if (dur < min_dur)
+		dur = min_dur;
+
+	return dur;
+}
+
+/*
+ * Start, refresh or stop weight raising for bfqq, depending on the
+ * weight-raising coefficient bfqq had before (old_wr_coeff) and on how
+ * bfqq has been classified by the caller (wr_or_deserves_wr,
+ * interactive, in_burst, soft_rt).
+ */
+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq,
+					     unsigned int old_wr_coeff,
+					     bool wr_or_deserves_wr,
+					     bool interactive,
+					     bool in_burst,
+					     bool soft_rt)
+{
+	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
+		/* A weight-raising period starts now. */
+		if (!interactive) {
+			bfqq->wr_start_at_switch_to_srt = jiffies;
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+				BFQ_SOFTRT_WEIGHT_FACTOR;
+			bfqq->wr_cur_max_time = bfqd->bfq_wr_rt_max_time;
+		} else {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+		}
+
+		/*
+		 * If needed, further reduce the budget so that it stays
+		 * close to bfqq's backlog: a too large budget inflates
+		 * the scheduling error. Only latency matters here, not
+		 * throughput. Still, the budget is not made too small
+		 * either, because too frequent expirations would
+		 * increase latency as well.
+		 */
+		bfqq->entity.budget = min_t(unsigned long,
+					    bfqq->entity.budget,
+					    2 * bfq_min_budget(bfqd));
+		return;
+	}
+
+	/* From here on, only queues that were already weight-raised. */
+	if (old_wr_coeff <= 1)
+		return;
+
+	if (interactive) {
+		/* update wr coeff and duration */
+		bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+		bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+		return;
+	}
+
+	if (in_burst) {
+		bfqq->wr_coeff = 1;
+		return;
+	}
+
+	if (!soft_rt)
+		return;
+
+	/*
+	 * The application is now or still meeting the requirements for
+	 * being deemed soft rt. We can then correctly and safely
+	 * (re)charge the weight-raising duration for the application
+	 * with the weight-raising duration for soft rt applications.
+	 *
+	 * In particular, doing this recharge now, i.e., before the
+	 * weight-raising period for the application finishes, reduces
+	 * the probability of the following negative scenario:
+	 * 1) the weight of a soft rt application is raised at startup
+	 *    (as for any newly created application),
+	 * 2) since the application is not interactive, at a certain
+	 *    time weight-raising is stopped for the application,
+	 * 3) at that time the application happens to still have
+	 *    pending requests, and hence is destined to not have a
+	 *    chance to be deemed soft rt before these requests are
+	 *    completed (see the comments to the function
+	 *    bfq_bfqq_softrt_next_start() for details on soft rt
+	 *    detection),
+	 * 4) these pending requests experience a high latency because
+	 *    the application is not weight-raised while they are
+	 *    pending.
+	 */
+	if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time) {
+		bfqq->wr_start_at_switch_to_srt = bfqq->last_wr_start_finish;
+
+		bfqq->wr_cur_max_time = bfqd->bfq_wr_rt_max_time;
+		bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+			BFQ_SOFTRT_WEIGHT_FACTOR;
+	}
+	bfqq->last_wr_start_finish = jiffies;
+}
+
+/*
+ * Return true if bfqq has no dispatched request and more than
+ * bfq_wr_min_idle_time has elapsed since its budget_timeout.
+ */
+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq)
+{
+	if (bfqq->dispatched != 0)
+		return false;
+
+	return time_is_before_jiffies(bfqq->budget_timeout +
+				      bfqd->bfq_wr_min_idle_time);
+}
+
+/*
+ * Handle the switch of bfqq from idle to busy: classify bfqq
+ * (interactive, soft real-time, in a large burst), update its budget,
+ * its I/O-bound detection state and, if low_latency is set, its
+ * weight-raising state, then add bfqq to the busy queues and expire
+ * the in-service queue if bfqq may need to preempt it.
+ *
+ * @bfqd: the device data bfqq belongs to.
+ * @bfqq: the queue switching to busy.
+ * @old_wr_coeff: weight-raising coefficient that is compared against
+ * bfqq->wr_coeff after the weight-raising update, to detect a change.
+ * @rq: the request used for the I/O-add group statistics update.
+ * @interactive: output parameter, set to true if bfqq is not in a
+ * large burst and has been idle for a long time.
+ */
+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ int old_wr_coeff,
+ struct request *rq,
+ bool *interactive)
+{
+ bool soft_rt, in_burst, wr_or_deserves_wr,
+ bfqq_wants_to_preempt,
+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
+ /*
+ * See the comments on
+ * bfq_bfqq_update_budg_for_activation for
+ * details on the usage of the next variable.
+ */
+ arrived_in_time = ktime_get_ns() <=
+ bfqq->ttime.last_end_request +
+ bfqd->bfq_slice_idle * 3;
+
+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags);
+
+ /*
+ * bfqq deserves to be weight-raised if:
+ * - it is sync,
+ * - it does not belong to a large burst,
+ * - it has been idle for enough time or is soft real-time,
+ * - is linked to a bfq_io_cq (it is not shared in any sense).
+ */
+ in_burst = bfq_bfqq_in_large_burst(bfqq);
+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+ !in_burst &&
+ time_is_before_jiffies(bfqq->soft_rt_next_start);
+ *interactive = !in_burst && idle_for_long_time;
+ wr_or_deserves_wr = bfqd->low_latency &&
+ (bfqq->wr_coeff > 1 ||
+ (bfq_bfqq_sync(bfqq) &&
+ bfqq->bic && (*interactive || soft_rt)));
+
+ /*
+ * Using the last flag, update budget and check whether bfqq
+ * may want to preempt the in-service queue.
+ */
+ bfqq_wants_to_preempt =
+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
+ arrived_in_time,
+ wr_or_deserves_wr);
+
+ /*
+ * If bfqq happened to be activated in a burst, but has been
+ * idle for much more than an interactive queue, then we
+ * assume that, in the overall I/O initiated in the burst, the
+ * I/O associated with bfqq is finished. So bfqq does not need
+ * to be treated as a queue belonging to a burst
+ * anymore. Accordingly, we reset bfqq's in_large_burst flag
+ * if set, and remove bfqq from the burst list if it's
+ * there. We do not decrement burst_size, because the fact
+ * that bfqq does not need to belong to the burst list any
+ * more does not invalidate the fact that bfqq was created in
+ * a burst.
+ */
+ if (likely(!bfq_bfqq_just_created(bfqq)) &&
+ idle_for_long_time &&
+ time_is_before_jiffies(
+ bfqq->budget_timeout +
+ msecs_to_jiffies(10000))) {
+ hlist_del_init(&bfqq->burst_list_node);
+ bfq_clear_bfqq_in_large_burst(bfqq);
+ }
+
+ bfq_clear_bfqq_just_created(bfqq);
+
+
+ /*
+ * While bfqq is not yet flagged as I/O bound, count the requests
+ * that arrive in time; flag bfqq as I/O bound once the count
+ * reaches bfq_requests_within_timer, and restart the count on a
+ * late arrival.
+ */
+ if (!bfq_bfqq_IO_bound(bfqq)) {
+ if (arrived_in_time) {
+ bfqq->requests_within_timer++;
+ if (bfqq->requests_within_timer >=
+ bfqd->bfq_requests_within_timer)
+ bfq_mark_bfqq_IO_bound(bfqq);
+ } else
+ bfqq->requests_within_timer = 0;
+ }
+
+ if (bfqd->low_latency) {
+ if (unlikely(time_is_after_jiffies(bfqq->split_time)))
+ /* wraparound */
+ bfqq->split_time =
+ jiffies - bfqd->bfq_wr_min_idle_time - 1;
+
+ /*
+ * Update weight raising only if more than
+ * bfq_wr_min_idle_time has elapsed since split_time
+ * (presumably the time of the last split of bfqq --
+ * TODO confirm against the definition of split_time).
+ */
+ if (time_is_before_jiffies(bfqq->split_time +
+ bfqd->bfq_wr_min_idle_time)) {
+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
+ old_wr_coeff,
+ wr_or_deserves_wr,
+ *interactive,
+ in_burst,
+ soft_rt);
+
+ if (old_wr_coeff != bfqq->wr_coeff)
+ bfqq->entity.prio_changed = 1;
+ }
+ }
+
+ bfqq->last_idle_bklogged = jiffies;
+ bfqq->service_from_backlogged = 0;
+ bfq_clear_bfqq_softrt_update(bfqq);
+
+ bfq_add_bfqq_busy(bfqd, bfqq);
+
+ /*
+ * Expire in-service queue only if preemption may be needed
+ * for guarantees. In this respect, the function
+ * next_queue_may_preempt just checks a simple, necessary
+ * condition, and not a sufficient condition based on
+ * timestamps. In fact, for the latter condition to be
+ * evaluated, timestamps would need first to be updated, and
+ * this operation is quite costly (see the comments on the
+ * function bfq_bfqq_update_budg_for_activation).
+ */
+ if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+ next_queue_may_preempt(bfqd))
+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+ false, BFQQE_PREEMPTED);
+}
+
+/*
+ * Add rq to the bfq_queue it belongs to (RQ_BFQQ(rq)): bump the
+ * per-queue and per-device request counters, insert rq into the
+ * queue's sort_list, and re-evaluate which request of the queue is
+ * the best candidate to be served next. If the queue was not busy,
+ * bfq_bfqq_handle_idle_busy_switch() handles the idle-to-busy
+ * transition; otherwise weight-raising may be started here for an
+ * async request, and a changed next_rq is propagated through
+ * bfq_updated_next_req().
+ */
+static void bfq_add_request(struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ struct request *next_rq, *prev;
+ /*
+ * Snapshot of the weight-raising coefficient taken before any of
+ * the updates below; compared later against the current value.
+ */
+ unsigned int old_wr_coeff = bfqq->wr_coeff;
+ /* May be set by bfq_bfqq_handle_idle_busy_switch() through its last argument. */
+ bool interactive = false;
+
+ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
+ bfqq->queued[rq_is_sync(rq)]++;
+ bfqd->queued++;
+
+ elv_rb_add(&bfqq->sort_list, rq);
+
+ /*
+ * Check if this request is a better next-serve candidate.
+ */
+ prev = bfqq->next_rq;
+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+ bfqq->next_rq = next_rq;
+
+ /*
+ * Adjust priority tree position, if next_rq changes.
+ */
+ if (prev != bfqq->next_rq)
+ bfq_pos_tree_add_move(bfqd, bfqq);
+
+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
+ rq, &interactive);
+ else {
+ /*
+ * Queue already busy: start weight-raising only if
+ * low_latency is set, the queue was not weight-raised on
+ * entry, rq is async, and more than
+ * bfq_wr_min_inter_arr_async has elapsed since
+ * last_wr_start_finish.
+ */
+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+ time_is_before_jiffies(
+ bfqq->last_wr_start_finish +
+ bfqd->bfq_wr_min_inter_arr_async)) {
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+ bfqd->wr_busy_queues++;
+ bfqq->entity.prio_changed = 1;
+ }
+ if (prev != bfqq->next_rq)
+ bfq_updated_next_req(bfqd, bfqq);
+ }
+
+ /*
+ * Assign jiffies to last_wr_start_finish in the following
+ * cases:
+ *
+ * . if bfqq is not going to be weight-raised, because, for
+ * non weight-raised queues, last_wr_start_finish stores the
+ * arrival time of the last request; as of now, this piece
+ * of information is used only for deciding whether to
+ * weight-raise async queues
+ *
+ * . if bfqq is not weight-raised, because, if bfqq is now
+ * switching to weight-raised, then last_wr_start_finish
+ * stores the time when weight-raising starts
+ *
+ * . if bfqq is interactive, because, regardless of whether
+ * bfqq is currently weight-raised, the weight-raising
+ * period must start or restart (this case is considered
+ * separately because it is not detected by the above
+ * conditions, if bfqq is already weight-raised)
+ *
+ * last_wr_start_finish has to be updated also if bfqq is soft
+ * real-time, because the weight-raising period is constantly
+ * restarted on idle-to-busy transitions for these queues, but
+ * this is already done in bfq_bfqq_handle_idle_busy_switch if
+ * needed.
+ */
+ if (bfqd->low_latency &&
+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+ bfqq->last_wr_start_finish = jiffies;
+}
+
+/*
+ * Look for a queued request that bio could be front-merged into: search
+ * the sort_list of the queue cached in bfqd->bio_bfqq for a request
+ * keyed at the sector where bio ends. Returns NULL if no queue is
+ * cached or no such request is found.
+ */
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
+					  struct bio *bio,
+					  struct request_queue *q)
+{
+	struct bfq_queue *target = bfqd->bio_bfqq;
+
+	if (!target)
+		return NULL;
+
+	return elv_rb_find(&target->sort_list, bio_end_sector(bio));
+}
+
+/*
+ * Distance, in sectors, between the start of rq and last_pos. A
+ * last_pos of zero is treated as "no previous position" and yields 0.
+ */
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
+{
+	if (!last_pos)
+		return 0;
+
+	return abs(blk_rq_pos(rq) - last_pos);
+}
+
+#if 0 /* Still not clear if we can do without next two functions */
+/*
+ * Compiled out by the enclosing #if 0. Would increment the count of
+ * requests in the driver (bfqd->rq_in_driver).
+ */
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+
+ bfqd->rq_in_driver++;
+}
+
+/*
+ * Compiled out by the enclosing #if 0. Would decrement the count of
+ * requests in the driver (bfqd->rq_in_driver).
+ */
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+
+ bfqd->rq_in_driver--;
+}
+#endif
+
+/*
+ * Remove rq from its bfq_queue: pick a new next_rq if rq was the
+ * current one, unlink rq from its list, drop it from the request
+ * counters, from the queue's sort_list and from q's merge hash, and
+ * forget it as q->last_merge. If the queue's sort_list becomes empty,
+ * the queue is removed from the busy queues (unless it is in service)
+ * and from the request-position tree. Finally, the meta_pending
+ * counter and the group statistics are updated.
+ */
+static void bfq_remove_request(struct request_queue *q,
+ struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ struct bfq_data *bfqd = bfqq->bfqd;
+ const int sync = rq_is_sync(rq);
+
+ if (bfqq->next_rq == rq) {
+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+ bfq_updated_next_req(bfqd, bfqq);
+ }
+
+ /* Unlink rq only if its queuelist node does not point back to itself. */
+ if (rq->queuelist.prev != &rq->queuelist)
+ list_del_init(&rq->queuelist);
+ bfqq->queued[sync]--;
+ bfqd->queued--;
+ elv_rb_del(&bfqq->sort_list, rq);
+
+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+
+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ bfqq->next_rq = NULL;
+
+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
+ /*
+ * bfqq emptied. In normal operation, when
+ * bfqq is empty, bfqq->entity.service and
+ * bfqq->entity.budget must contain,
+ * respectively, the service received and the
+ * budget used last time bfqq emptied. These
+ * facts do not hold in this case, as at least
+ * this last removal occurred while bfqq is
+ * not in service. To avoid inconsistencies,
+ * reset both bfqq->entity.service and
+ * bfqq->entity.budget, if bfqq has still a
+ * process that may issue I/O requests to it.
+ *
+ * NOTE(review): no check on the processes
+ * attached to bfqq is visible here; the reset
+ * below is unconditional within this branch.
+ */
+ bfqq->entity.budget = bfqq->entity.service = 0;
+ }
+
+ /*
+ * Remove queue from request-position tree as it is empty.
+ */
+ if (bfqq->pos_root) {
+ rb_erase(&bfqq->pos_node, bfqq->pos_root);
+ bfqq->pos_root = NULL;
+ }
+ }
+
+ if (rq->cmd_flags & REQ_META)
+ bfqq->meta_pending--;
+
+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
+}
+
+/*
+ * Try to merge bio into a request already held by the scheduler.
+ *
+ * The bic of the current task is looked up before bfqd->lock is taken;
+ * then, under the lock, the bic and the bfq_queue matching the sync
+ * flag of bio are cached in bfqd->bio_bic and bfqd->bio_bfqq (both are
+ * read by bfq_find_rq_fmerge() and bfq_allow_bio_merge()), and the
+ * merge attempt is delegated to blk_mq_sched_try_merge(). A request
+ * handed back through 'free' is released with blk_mq_free_request()
+ * while bfqd->lock is still held.
+ *
+ * Returns the value returned by blk_mq_sched_try_merge().
+ */
+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+ struct request_queue *q = hctx->queue;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct request *free = NULL;
+ /*
+ * bfq_bic_lookup grabs the queue_lock: invoke it now and
+ * store its return value for later use, to avoid nesting
+ * queue_lock inside the bfqd->lock. We assume that the bic
+ * returned by bfq_bic_lookup does not go away before
+ * bfqd->lock is taken.
+ */
+ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
+ bool ret;
+
+ spin_lock_irq(&bfqd->lock);
+
+ /* No bic for the current task: no queue to look for merge candidates in. */
+ if (bic)
+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
+ else
+ bfqd->bio_bfqq = NULL;
+ bfqd->bio_bic = bic;
+
+ ret = blk_mq_sched_try_merge(q, bio, &free);
+
+ if (free)
+ blk_mq_free_request(free);
+ spin_unlock_irq(&bfqd->lock);
+
+ return ret;
+}
+
+/*
+ * Decide whether bio can be front-merged into a queued request. On
+ * success the request is stored in *req and ELEVATOR_FRONT_MERGE is
+ * returned; otherwise *req is left untouched and ELEVATOR_NO_MERGE is
+ * returned.
+ */
+static int bfq_request_merge(struct request_queue *q, struct request **req,
+			     struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct request *candidate = bfq_find_rq_fmerge(bfqd, bio, q);
+
+	if (!candidate || !elv_bio_merge_ok(candidate, bio))
+		return ELEVATOR_NO_MERGE;
+
+	*req = candidate;
+	return ELEVATOR_FRONT_MERGE;
+}
+
+/*
+ * Fix up the scheduler state after a bio has been merged into req.
+ * Only a front merge that moved the start of req before the start of
+ * its predecessor in the sort_list needs work: req is re-inserted in
+ * the sort_list and the queue's next request is re-evaluated.
+ */
+static void bfq_request_merged(struct request_queue *q, struct request *req,
+			       enum elv_merge type)
+{
+	struct rb_node *prev_node;
+	struct bfq_queue *bfqq;
+	struct bfq_data *bfqd;
+	struct request *old_next;
+
+	if (type != ELEVATOR_FRONT_MERGE)
+		return;
+
+	prev_node = rb_prev(&req->rb_node);
+	if (!prev_node)
+		return;
+
+	/* Still sorted with respect to the predecessor: nothing to do. */
+	if (blk_rq_pos(req) >=
+	    blk_rq_pos(container_of(prev_node, struct request, rb_node)))
+		return;
+
+	bfqq = RQ_BFQQ(req);
+	bfqd = bfqq->bfqd;
+
+	/* Reposition request in its sort_list */
+	elv_rb_del(&bfqq->sort_list, req);
+	elv_rb_add(&bfqq->sort_list, req);
+
+	/* Choose next request to be served for bfqq */
+	old_next = bfqq->next_rq;
+	bfqq->next_rq = bfq_choose_req(bfqd, old_next, req,
+				       bfqd->last_position);
+
+	/*
+	 * A different next request means that both the queue's budget
+	 * and the queue's position in its rq_pos_tree must be updated.
+	 */
+	if (bfqq->next_rq == old_next)
+		return;
+
+	bfq_updated_next_req(bfqd, bfqq);
+	bfq_pos_tree_add_move(bfqd, bfqq);
+}
+
+/*
+ * Update the scheduler state for a merge of next into rq.
+ *
+ * If rq->rb_node is not empty (RB_EMPTY_NODE() is false), only the
+ * merge statistics are updated. Otherwise, under bfqd->lock: rq may
+ * take the place of next in the fifo, rq becomes the queue's next_rq
+ * if next was, and next is removed through bfq_remove_request().
+ */
+static void bfq_requests_merged(struct request_queue *q, struct request *rq,
+ struct request *next)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ goto end;
+ spin_lock_irq(&bfqq->bfqd->lock);
+
+ /*
+ * If next and rq belong to the same bfq_queue and next is older
+ * than rq, then reposition rq in the fifo (by substituting next
+ * with rq). Otherwise, if next and rq belong to different
+ * bfq_queues, never reposition rq: in fact, we would have to
+ * reposition it with respect to next's position in its own fifo,
+ * which would most certainly be too expensive with respect to
+ * the benefits.
+ */
+ if (bfqq == next_bfqq &&
+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+ next->fifo_time < rq->fifo_time) {
+ list_del_init(&rq->queuelist);
+ list_replace_init(&next->queuelist, &rq->queuelist);
+ rq->fifo_time = next->fifo_time;
+ }
+
+ /* next is about to be removed: do not leave it as the next request. */
+ if (bfqq->next_rq == next)
+ bfqq->next_rq = rq;
+
+ bfq_remove_request(q, next);
+
+ spin_unlock_irq(&bfqq->bfqd->lock);
+end:
+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+}
+
+/*
+ * Terminate weight-raising for bfqq: reset its coefficient and maximum
+ * duration, record the current time as the end of the period and, if
+ * bfqq is busy, drop it from the count of weight-raised busy queues.
+ *
+ * Must be called with bfqq != NULL.
+ */
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	if (bfq_bfqq_busy(bfqq))
+		bfqd->wr_busy_queues--;
+
+	bfqq->last_wr_start_finish = jiffies;
+	bfqq->wr_cur_max_time = 0;
+	bfqq->wr_coeff = 1;
+
+	/*
+	 * Have the weight recomputed on the next invocation of
+	 * __bfq_entity_update_weight_prio.
+	 */
+	bfqq->entity.prio_changed = 1;
+}
+
+/*
+ * End weight-raising for every async queue of bfqg that exists: the
+ * entries of the async_bfqq table and the async idle queue.
+ */
+void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+			     struct bfq_group *bfqg)
+{
+	int row, prio;
+
+	for (row = 0; row < 2; row++) {
+		for (prio = 0; prio < IOPRIO_BE_NR; prio++) {
+			struct bfq_queue *async_q = bfqg->async_bfqq[row][prio];
+
+			if (async_q)
+				bfq_bfqq_end_wr(async_q);
+		}
+	}
+
+	if (bfqg->async_idle_bfqq)
+		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+/*
+ * End weight-raising for all queues of bfqd, under bfqd->lock: those on
+ * the active list, those on the idle list, and the async queues.
+ */
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+	struct bfq_queue *cur;
+
+	spin_lock_irq(&bfqd->lock);
+
+	list_for_each_entry(cur, &bfqd->active_list, bfqq_list) {
+		bfq_bfqq_end_wr(cur);
+	}
+
+	list_for_each_entry(cur, &bfqd->idle_list, bfqq_list) {
+		bfq_bfqq_end_wr(cur);
+	}
+
+	bfq_end_wr_async(bfqd);
+
+	spin_unlock_irq(&bfqd->lock);
+}
+
+/*
+ * Start sector of io_struct, which is interpreted as a request when
+ * 'request' is true and as a struct bio otherwise.
+ */
+static sector_t bfq_io_struct_pos(void *io_struct, bool request)
+{
+	if (!request)
+		return ((struct bio *)io_struct)->bi_iter.bi_sector;
+
+	return blk_rq_pos(io_struct);
+}
+
+/*
+ * Non-zero if the start of io_struct (a request or a bio, as selected
+ * by 'request') lies within BFQQ_CLOSE_THR of sector.
+ */
+static int bfq_rq_close_to_sector(void *io_struct, bool request,
+				  sector_t sector)
+{
+	sector_t pos = bfq_io_struct_pos(io_struct, request);
+
+	return abs(pos - sector) <= BFQQ_CLOSE_THR;
+}
+
+/*
+ * Search the rq_pos_tree of the group of bfqq for a queue whose next
+ * request starts at, or close to, sector. Returns that queue, or NULL
+ * if the tree is empty or no queue is close enough.
+ */
+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
+					 struct bfq_queue *bfqq,
+					 sector_t sector)
+{
+	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+	struct rb_node *parent, *neighbor;
+	struct bfq_queue *candidate;
+
+	if (RB_EMPTY_ROOT(root))
+		return NULL;
+
+	/*
+	 * Best case: a queue whose next request starts exactly at
+	 * sector.
+	 */
+	candidate = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
+	if (candidate)
+		return candidate;
+
+	/*
+	 * No exact match: the parent of the NULL leaf reached by the
+	 * lookup holds the closest sector (rq_pos_tree is sorted by
+	 * next_request position).
+	 */
+	candidate = rb_entry(parent, struct bfq_queue, pos_node);
+	if (bfq_rq_close_to_sector(candidate->next_rq, true, sector))
+		return candidate;
+
+	/* Otherwise try the neighbor lying on the side of sector. */
+	neighbor = blk_rq_pos(candidate->next_rq) < sector ?
+		rb_next(&candidate->pos_node) :
+		rb_prev(&candidate->pos_node);
+	if (!neighbor)
+		return NULL;
+
+	candidate = rb_entry(neighbor, struct bfq_queue, pos_node);
+
+	return bfq_rq_close_to_sector(candidate->next_rq, true, sector) ?
+		candidate : NULL;
+}
+
+/*
+ * Look for a queue, other than cur_bfqq, whose next request is close to
+ * sector.
+ *
+ * Queues working closely on the same area of the device are
+ * cooperating: grouping them together avoids wasting time idling and
+ * lets the union of their requests be served in the best possible
+ * order for throughput.
+ */
+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
+						   struct bfq_queue *cur_bfqq,
+						   sector_t sector)
+{
+	struct bfq_queue *found = bfqq_find_close(bfqd, cur_bfqq, sector);
+
+	/* A queue cannot cooperate with itself. */
+	return found == cur_bfqq ? NULL : found;
+}
+
+/*
+ * Schedule the merge of bfqq into new_bfqq, or into the last queue of
+ * the ->new_bfqq chain starting at new_bfqq. On success,
+ * bfqq->new_bfqq is set to the target queue, the reference counter of
+ * the target is increased by the number of process references of bfqq,
+ * and the target is returned. NULL is returned, and nothing is
+ * changed, if either queue has no process references or if following
+ * the chain leads back to bfqq.
+ */
+static struct bfq_queue *
+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+ int process_refs, new_process_refs;
+ struct bfq_queue *__bfqq;
+
+ /*
+ * If there are no process references on the new_bfqq, then it is
+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
+ * may have dropped their last reference (not just their last process
+ * reference).
+ */
+ if (!bfqq_process_refs(new_bfqq))
+ return NULL;
+
+ /* Avoid a circular list and skip interim queue merges. */
+ while ((__bfqq = new_bfqq->new_bfqq)) {
+ if (__bfqq == bfqq)
+ return NULL;
+ new_bfqq = __bfqq;
+ }
+
+ /* From here on, new_bfqq is the last queue of the chain. */
+ process_refs = bfqq_process_refs(bfqq);
+ new_process_refs = bfqq_process_refs(new_bfqq);
+ /*
+ * If the process for the bfqq has gone away, there is no
+ * sense in merging the queues.
+ */
+ if (process_refs == 0 || new_process_refs == 0)
+ return NULL;
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
+ new_bfqq->pid);
+
+ /*
+ * Merging is just a redirection: the requests of the process
+ * owning one of the two queues are redirected to the other queue.
+ * The latter queue, in its turn, is set as shared if this is the
+ * first time that the requests of some process are redirected to
+ * it.
+ *
+ * We redirect bfqq to new_bfqq and not the opposite, because
+ * we are in the context of the process owning bfqq, thus we
+ * have the io_cq of this process. So we can immediately
+ * configure this io_cq to redirect the requests of the
+ * process to new_bfqq. In contrast, the io_cq of new_bfqq is
+ * not available any more (new_bfqq->bic == NULL).
+ *
+ * Anyway, even in case new_bfqq coincides with the in-service
+ * queue, redirecting requests to the in-service queue is the
+ * best option, as we feed the in-service queue with new
+ * requests close to the last request served and, by doing so,
+ * are likely to increase the throughput.
+ */
+ bfqq->new_bfqq = new_bfqq;
+ new_bfqq->ref += process_refs;
+ return new_bfqq;
+}
+
+/*
+ * Cheap filter applied before scheduling a merge between bfqq and
+ * new_bfqq. Returns false if the two queues must not be merged.
+ */
+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
+					struct bfq_queue *new_bfqq)
+{
+	/* Idle-class queues never take part in a merge. */
+	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq))
+		return false;
+
+	/* Both queues must belong to the same I/O priority class. */
+	if (bfqq->ioprio_class != new_bfqq->ioprio_class)
+		return false;
+
+	/*
+	 * A queue already detected as seeky is unlikely to produce
+	 * sequential I/O once merged with the other one.
+	 */
+	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
+		return false;
+
+	/*
+	 * Interleaved I/O is known to be done by (some) applications
+	 * only for reads, so merging makes sense only if both queues
+	 * are sync.
+	 */
+	return bfq_bfqq_sync(bfqq) && bfq_bfqq_sync(new_bfqq);
+}
+
+/*
+ * Returns true if bfqq is weight-raised and more than 100 ms have
+ * elapsed since last_wr_start_finish, in which case bfqq cannot be
+ * merged. The rationale is that true cooperation shows up very early
+ * after processes start doing I/O, whereas late cooperations are
+ * usually accidental false positives, which, for a weight-raised
+ * queue, would evidently degrade latency guarantees.
+ */
+static bool wr_from_too_long(struct bfq_queue *bfqq)
+{
+	if (bfqq->wr_coeff <= 1)
+		return false;
+
+	return time_is_before_jiffies(bfqq->last_wr_start_finish +
+				      msecs_to_jiffies(100));
+}
+
+/*
+ * Attempt to schedule a merge of bfqq with the currently in-service
+ * queue or with a close queue among the scheduled queues. Return
+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue
+ * structure otherwise.
+ *
+ * io_struct points to the request or bio that triggered the attempt;
+ * 'request' is true in the former case and false in the latter (see
+ * bfq_io_struct_pos()). If a merge has already been scheduled for
+ * bfqq, its target (bfqq->new_bfqq) is returned right away.
+ *
+ * The OOM queue is not allowed to participate to cooperation: in fact, since
+ * the requests temporarily redirected to the OOM queue could be redirected
+ * again to dedicated queues at any time, the state needed to correctly
+ * handle merging with the OOM queue would be quite complex and expensive
+ * to maintain. Besides, in such a critical condition as an out of memory,
+ * the benefits of queue merging may be little relevant, or even negligible.
+ *
+ * Weight-raised queues can be merged only if their weight-raising
+ * period has just started. In fact cooperating processes are usually
+ * started together. Thus, with this filter we avoid false positives
+ * that would jeopardize low-latency guarantees.
+ *
+ * WARNING: queue merging may impair fairness among non-weight raised
+ * queues, for at least two reasons: 1) the original weight of a
+ * merged queue may change during the merged state, 2) even being the
+ * weight the same, a merged queue may be bloated with many more
+ * requests than the ones produced by its originally-associated
+ * process.
+ */
+static struct bfq_queue *
+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ void *io_struct, bool request)
+{
+ struct bfq_queue *in_service_bfqq, *new_bfqq;
+
+ if (bfqq->new_bfqq)
+ return bfqq->new_bfqq;
+
+ if (!io_struct ||
+ wr_from_too_long(bfqq) ||
+ unlikely(bfqq == &bfqd->oom_bfqq))
+ return NULL;
+
+ /* If there is only one backlogged queue, don't search. */
+ if (bfqd->busy_queues == 1)
+ return NULL;
+
+ in_service_bfqq = bfqd->in_service_queue;
+
+ /*
+ * Skip the in-service queue as a merge target if there is none,
+ * if it is bfqq itself, if it has been weight-raised for too
+ * long, or if it is the OOM queue.
+ */
+ if (!in_service_bfqq || in_service_bfqq == bfqq
+ || wr_from_too_long(in_service_bfqq) ||
+ unlikely(in_service_bfqq == &bfqd->oom_bfqq))
+ goto check_scheduled;
+
+ /*
+ * Try the in-service queue first: the new I/O must be close to
+ * the last position recorded in bfqd, and the two queues must
+ * have the same parent entity.
+ */
+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+ bfqq->entity.parent == in_service_bfqq->entity.parent &&
+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
+ if (new_bfqq)
+ return new_bfqq;
+ }
+ /*
+ * Check whether there is a cooperator among currently scheduled
+ * queues. The only thing we need is that the bio/request is not
+ * NULL, as we need it to establish whether a cooperator exists.
+ */
+check_scheduled:
+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
+ bfq_io_struct_pos(io_struct, request));
+
+ if (new_bfqq && !wr_from_too_long(new_bfqq) &&
+ likely(new_bfqq != &bfqd->oom_bfqq) &&
+ bfq_may_be_close_cooperator(bfqq, new_bfqq))
+ return bfq_setup_merge(bfqq, new_bfqq);
+
+ return NULL;
+}
+
+/*
+ * Save into the bic attached to bfqq the part of the queue state that
+ * is kept across a merge: think time, idle window, I/O-bound and burst
+ * flags, and weight-raising parameters.
+ */
+static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+{
+	struct bfq_io_cq *owner = bfqq->bic;
+
+	/*
+	 * A queue with no bic is already shared, or its requests have
+	 * already been redirected to a shared queue: idle window and
+	 * weight-raising state have been saved before, so there is
+	 * nothing to do.
+	 */
+	if (!owner)
+		return;
+
+	/* Weight-raising parameters. */
+	owner->saved_wr_coeff = bfqq->wr_coeff;
+	owner->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+	owner->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+	owner->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+
+	/* Burst membership. */
+	owner->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+	owner->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+
+	/* Think time and per-queue flags. */
+	owner->saved_ttime = bfqq->ttime;
+	owner->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+	owner->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+}
+
+/*
+ * Carry out the merge of bfqq into new_bfqq on behalf of bic: save the
+ * state of both queues, move the I/O-bound flag and, if applicable,
+ * the weight-raising state from bfqq to new_bfqq, redirect bic to
+ * new_bfqq, mark new_bfqq as coop, detach both queues from their bic,
+ * and finally drop a reference to bfqq.
+ */
+static void
+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
+ (unsigned long)new_bfqq->pid);
+ /* Save weight raising and idle window of the merged queues */
+ bfq_bfqq_save_state(bfqq);
+ bfq_bfqq_save_state(new_bfqq);
+ if (bfq_bfqq_IO_bound(bfqq))
+ bfq_mark_bfqq_IO_bound(new_bfqq);
+ bfq_clear_bfqq_IO_bound(bfqq);
+
+ /*
+ * If bfqq is weight-raised, then let new_bfqq inherit
+ * weight-raising. To reduce false positives, neglect the case
+ * where bfqq has just been created, but has not yet made it
+ * to be weight-raised (which may happen because EQM may merge
+ * bfqq even before bfq_add_request is executed for the first
+ * time for bfqq). Handling this case would however be very
+ * easy, thanks to the flag just_created.
+ */
+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
+ new_bfqq->wr_coeff = bfqq->wr_coeff;
+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
+ new_bfqq->wr_start_at_switch_to_srt =
+ bfqq->wr_start_at_switch_to_srt;
+ /* Keep the count of weight-raised busy queues consistent. */
+ if (bfq_bfqq_busy(new_bfqq))
+ bfqd->wr_busy_queues++;
+ new_bfqq->entity.prio_changed = 1;
+ }
+
+ /*
+ * Weight-raising is ended for bfqq whenever bfqq is weight-raised,
+ * i.e., also when new_bfqq was already weight-raised and hence did
+ * not inherit anything in the branch above.
+ */
+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
+ bfqq->wr_coeff = 1;
+ bfqq->entity.prio_changed = 1;
+ if (bfq_bfqq_busy(bfqq))
+ bfqd->wr_busy_queues--;
+ }
+
+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
+ bfqd->wr_busy_queues);
+
+ /*
+ * Merge queues (that is, let bic redirect its requests to new_bfqq)
+ */
+ bic_set_bfqq(bic, new_bfqq, 1);
+ bfq_mark_bfqq_coop(new_bfqq);
+ /*
+ * new_bfqq now belongs to at least two bics (it is a shared queue):
+ * set new_bfqq->bic to NULL. bfqq either:
+ * - does not belong to any bic any more, and hence bfqq->bic must
+ * be set to NULL, or
+ * - is a queue whose owning bics have already been redirected to a
+ * different queue, hence the queue is destined to not belong to
+ * any bic soon and bfqq->bic is already NULL (therefore the next
+ * assignment causes no harm).
+ */
+ new_bfqq->bic = NULL;
+ bfqq->bic = NULL;
+ /* release process reference to bfqq */
+ bfq_put_queue(bfqq);
+}
+
+/*
+ * Tell whether bio may be merged into rq. A sync bio is never merged
+ * into an async request; apart from that, the merge is allowed only if
+ * rq belongs to the queue that bio is destined to, i.e., the queue
+ * cached in bfqd->bio_bfqq. As a side effect, this function may merge
+ * that queue with a cooperating queue, in which case bfqd->bio_bfqq is
+ * updated to the shared queue.
+ */
+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ bool is_sync = op_is_sync(bio->bi_opf);
+ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq;
+
+ /*
+ * Disallow merge of a sync bio into an async request.
+ */
+ if (is_sync && !rq_is_sync(rq))
+ return false;
+
+ /*
+ * Lookup the bfqq that this bio will be queued with. Allow
+ * merge only if rq is queued there.
+ */
+ if (!bfqq)
+ return false;
+
+ /*
+ * We take advantage of this function to perform an early merge
+ * of the queues of possible cooperating processes.
+ */
+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+ if (new_bfqq) {
+ /*
+ * bic still points to bfqq, then it has not yet been
+ * redirected to some other bfq_queue, and a queue
+ * merge between bfqq and new_bfqq can be safely
+ * fulfilled, i.e., bic can be redirected to new_bfqq
+ * and bfqq can be put.
+ */
+ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq,
+ new_bfqq);
+ /*
+ * If we get here, bio will be queued into new_bfqq,
+ * so use new_bfqq to decide whether bio and rq can be
+ * merged.
+ */
+ bfqq = new_bfqq;
+
+ /*
+ * Change also bfqd->bio_bfqq, as
+ * bfqd->bio_bic now points to new_bfqq, and
+ * this function may be invoked again (and then may
+ * use again bfqd->bio_bfqq).
+ */
+ bfqd->bio_bfqq = bfqq;
+ }
+
+ return bfqq == RQ_BFQQ(rq);
+}
+
+/*
+ * Record the start of a new budget for bfqq and set the deadline by
+ * which bfqq has to consume it. The deadline bounds the service slot
+ * of seeky processes, which are thus served, in practice, according
+ * to a time-slice scheme, and cannot lower the throughput too much.
+ *
+ * The base timeout bfqd->bfq_timeout is scaled by the ratio between
+ * the current and the original weight of the queue, unless the
+ * maximum weight-raising time of the queue equals
+ * bfqd->bfq_wr_rt_max_time, in which case no scaling is applied.
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq)
+{
+	unsigned int timeout_coeff = 1;
+
+	if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time)
+		timeout_coeff = bfqq->entity.weight /
+				bfqq->entity.orig_weight;
+
+	bfqd->last_budget_start = ktime_get();
+
+	bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff;
+}
+
+/*
+ * Make bfqq the in-service queue of bfqd. If bfqq is NULL, the only
+ * effect is that bfqd->in_service_queue is set to NULL. Otherwise,
+ * before the assignment, the group statistics and the
+ * budgets_assigned counter are updated, the fifo_expire flag of bfqq
+ * is cleared, the start of the weight-raising period is possibly
+ * moved forward, and a new budget timeout is set for bfqq.
+ */
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ if (bfqq) {
+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
+ bfq_clear_bfqq_fifo_expire(bfqq);
+
+ /* Decaying update: new = (7 * old + 256) / 8. */
+ bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8;
+
+ /*
+ * The branch below is taken only for a weight-raised queue
+ * whose maximum weight-raising time equals
+ * bfqd->bfq_wr_rt_max_time, and only if both
+ * last_wr_start_finish and budget_timeout lie in the past.
+ */
+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
+ bfqq->wr_coeff > 1 &&
+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+ time_is_before_jiffies(bfqq->budget_timeout)) {
+ /*
+ * For soft real-time queues, move the start
+ * of the weight-raising period forward by the
+ * time the queue has not received any
+ * service. Otherwise, a relatively long
+ * service delay is likely to cause the
+ * weight-raising period of the queue to end,
+ * because of the short duration of the
+ * weight-raising period of a soft real-time
+ * queue. It is worth noting that this move
+ * is not so dangerous for the other queues,
+ * because soft real-time queues are not
+ * greedy.
+ *
+ * To not add a further variable, we use the
+ * overloaded field budget_timeout to
+ * determine for how long the queue has not
+ * received service, i.e., how much time has
+ * elapsed since the queue expired. However,
+ * this is a little imprecise, because
+ * budget_timeout is set to jiffies if bfqq
+ * not only expires, but also remains with no
+ * request.
+ */
+ if (time_after(bfqq->budget_timeout,
+ bfqq->last_wr_start_finish))
+ bfqq->last_wr_start_finish +=
+ jiffies - bfqq->budget_timeout;
+ else
+ bfqq->last_wr_start_finish = jiffies;
+ }
+
+ bfq_set_budget_timeout(bfqd, bfqq);
+ bfq_log_bfqq(bfqd, bfqq,
+ "set_in_service_queue, cur-budget = %d",
+ bfqq->entity.budget);
+ }
+
+ bfqd->in_service_queue = bfqq;
+}
+
+/*
+ * Pick the next queue to serve, through bfq_get_next_queue(), and
+ * install it as the in-service queue. Returns the queue picked.
+ */
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
+{
+	struct bfq_queue *next_queue = bfq_get_next_queue(bfqd);
+
+	__bfq_set_in_service_queue(bfqd, next_queue);
+
+	return next_queue;
+}
+
+/*
+ * Start idling on the in-service queue: flag the queue as waiting for
+ * a request, record the time at which idling starts, and arm
+ * bfqd->idle_slice_timer with a relative timeout.
+ */
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
+{
+	struct bfq_queue *in_serv = bfqd->in_service_queue;
+	/*
+	 * We don't want to idle for seeks, but we do want to allow
+	 * fair distribution of slice time for a process doing
+	 * back-to-back seeks. So, by default, allow a little bit of
+	 * time for it to submit a new rq.
+	 */
+	u32 idle_ns = bfqd->bfq_slice_idle;
+
+	bfq_mark_bfqq_wait_request(in_serv);
+
+	/*
+	 * A seeky queue gets only the minimum idle time, unless it is
+	 * weight-raised or the scenario is asymmetric. In those two
+	 * cases a long idling is preserved, because it is needed to
+	 * guarantee to the queue its reserved share of the throughput
+	 * (in particular, if the queue has a higher weight than some
+	 * other queue).
+	 */
+	if (BFQQ_SEEKY(in_serv) && in_serv->wr_coeff == 1 &&
+	    bfq_symmetric_scenario(bfqd))
+		idle_ns = min_t(u64, idle_ns, BFQ_MIN_TT);
+
+	bfqd->last_idling_start = ktime_get();
+	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(idle_ns),
+		      HRTIMER_MODE_REL);
+	bfqg_stats_set_start_idle_time(bfqq_group(in_serv));
+}
+
+/*
+ * Maximum budget used in autotuning mode: the amount of sectors that
+ * can be transferred during one timeout (bfqd->bfq_timeout) at the
+ * estimated peak rate. With such a budget, a queue served at peak
+ * rate can use a full timeslice, which maximises throughput with
+ * sequential workloads.
+ */
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
+{
+	u64 scaled = (u64)bfqd->peak_rate * USEC_PER_MSEC;
+
+	scaled *= jiffies_to_msecs(bfqd->bfq_timeout);
+
+	/* peak_rate is stored left-shifted by BFQ_RATE_SHIFT. */
+	return scaled >> BFQ_RATE_SHIFT;
+}
+
+/*
+ * Update parameters related to throughput and responsiveness, as a
+ * function of the estimated peak rate. See comments on
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ *
+ * In detail: the maximum budget is recomputed unless the user has set
+ * one (bfq_user_max_budget != 0), and the device is moved between the
+ * FAST and SLOW speed classes, with RT_prod updated accordingly, when
+ * the peak rate crosses the threshold for its device type.
+ */
+static void update_thr_responsiveness_params(struct bfq_data *bfqd)
+{
+	/* Index into the per-device-type tables: 0 rotational, 1 non-rotational. */
+	int dev_type = blk_queue_nonrot(bfqd->queue);
+
+	if (bfqd->bfq_user_max_budget == 0)
+		bfqd->bfq_max_budget =
+			bfq_calc_max_budget(bfqd);
+
+	if (bfqd->device_speed == BFQ_BFQD_FAST &&
+	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
+		bfqd->device_speed = BFQ_BFQD_SLOW;
+		bfqd->RT_prod = R_slow[dev_type] *
+			T_slow[dev_type];
+	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
+		bfqd->device_speed = BFQ_BFQD_FAST;
+		bfqd->RT_prod = R_fast[dev_type] *
+			T_fast[dev_type];
+	}
+
+	/*
+	 * The rates logged are the reference rate of the current speed
+	 * class and the class threshold, both converted to sectors/sec.
+	 */
+	bfq_log(bfqd,
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu sects/sec",
+		dev_type == 0 ? "ROT" : "NONROT",
+		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
+		bfqd->device_speed == BFQ_BFQD_FAST ?
+		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
+		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
+		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
+		BFQ_RATE_SHIFT);
+}
+
+/*
+ * Restart the collection of samples for the peak-rate estimation.
+ *
+ * With a non-NULL rq, a new observation interval is opened at the
+ * current time, with rq as its first sample. With a NULL rq, only the
+ * number of samples is zeroed.
+ */
+static void bfq_reset_rate_computation(struct bfq_data *bfqd,
+				       struct request *rq)
+{
+	if (rq == NULL) {
+		/*
+		 * No new rq dispatched: just reset the number of
+		 * samples (full re-init on next dispatch).
+		 */
+		bfqd->peak_rate_samples = 0;
+	} else {
+		/* New rq dispatched now: reset accordingly. */
+		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
+		bfqd->peak_rate_samples = 1;
+		bfqd->sequential_samples = 0;
+		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
+			blk_rq_sectors(rq);
+	}
+
+	bfq_log(bfqd,
+		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		bfqd->tot_sectors_dispatched);
+}
+
+/*
+ * Close the current observation interval: if enough samples have been
+ * collected over a long enough time, compute the rate measured in the
+ * interval and, unless the measure is discarded, blend it into
+ * bfqd->peak_rate with a low-pass filter and refresh the parameters
+ * that depend on the peak rate. In any case, the rate computation is
+ * then reset through bfq_reset_rate_computation(bfqd, rq).
+ */
+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+{
+ u32 rate, weight, divisor;
+
+ /*
+ * For the convergence property to hold (see comments on
+ * bfq_update_peak_rate()) and for the assessment to be
+ * reliable, a minimum number of samples must be present, and
+ * a minimum amount of time must have elapsed. If not so, do
+ * not compute new rate. Just reset parameters, to get ready
+ * for a new evaluation attempt.
+ */
+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL)
+ goto reset_computation;
+
+ /*
+ * If a new request completion has occurred after last
+ * dispatch, then, to approximate the rate at which requests
+ * have been served by the device, it is more precise to
+ * extend the observation interval to the last completion.
+ */
+ bfqd->delta_from_first =
+ max_t(u64, bfqd->delta_from_first,
+ bfqd->last_completion - bfqd->first_dispatch);
+
+ /*
+ * Rate computed in sects/usec, and not sects/nsec, for
+ * precision issues.
+ */
+ /* The rate is kept left-shifted by BFQ_RATE_SHIFT. */
+ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+ div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
+
+ /*
+ * Peak rate not updated if:
+ * - the percentage of sequential dispatches is below 3/4 of the
+ * total, and rate is below the current estimated peak rate
+ * - rate is unreasonably high (> 20M sectors/sec)
+ */
+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
+ rate <= bfqd->peak_rate) ||
+ rate > 20<<BFQ_RATE_SHIFT)
+ goto reset_computation;
+
+ /*
+ * We have to update the peak rate, at last! To this purpose,
+ * we use a low-pass filter. We compute the smoothing constant
+ * of the filter as a function of the 'weight' of the new
+ * measured rate.
+ *
+ * As can be seen in next formulas, we define this weight as a
+ * quantity proportional to how sequential the workload is,
+ * and to how long the observation time interval is.
+ *
+ * The weight runs from 0 to 8. The maximum value of the
+ * weight, 8, yields the minimum value for the smoothing
+ * constant. At this minimum value for the smoothing constant,
+ * the measured rate contributes for half of the next value of
+ * the estimated peak rate.
+ *
+ * So, the first step is to compute the weight as a function
+ * of how sequential the workload is. Note that the weight
+ * cannot reach 9, because bfqd->sequential_samples cannot
+ * become equal to bfqd->peak_rate_samples, which, in its
+ * turn, holds true because bfqd->sequential_samples is not
+ * incremented for the first sample.
+ */
+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+ /*
+ * Second step: further refine the weight as a function of the
+ * duration of the observation interval.
+ */
+ /* The result is capped at 8. */
+ weight = min_t(u32, 8,
+ div_u64(weight * bfqd->delta_from_first,
+ BFQ_RATE_REF_INTERVAL));
+
+ /*
+ * Divisor ranging from 10, for minimum weight, to 2, for
+ * maximum weight.
+ */
+ divisor = 10 - weight;
+
+ /*
+ * Finally, update peak rate:
+ *
+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor
+ */
+ bfqd->peak_rate *= divisor-1;
+ bfqd->peak_rate /= divisor;
+ rate /= divisor; /* smoothing constant alpha = 1/divisor */
+
+ bfqd->peak_rate += rate;
+ update_thr_responsiveness_params(bfqd);
+
+reset_computation:
+ bfq_reset_rate_computation(bfqd, rq);
+}
+
+/*
+ * Update the read/write peak rate (the main quantity used for
+ * auto-tuning, see update_thr_responsiveness_params()).
+ *
+ * It is not trivial to estimate the peak rate (correctly): because of
+ * the presence of sw and hw queues between the scheduler and the
+ * device components that finally serve I/O requests, it is hard to
+ * say exactly when a given dispatched request is served inside the
+ * device, and for how long. As a consequence, it is hard to know
+ * precisely at what rate a given set of requests is actually served
+ * by the device.
+ *
+ * On the opposite end, the dispatch time of any request is trivially
+ * available, and, from this piece of information, the "dispatch rate"
+ * of requests can be immediately computed. So, the idea in the next
+ * function is to use what is known, namely request dispatch times
+ * (plus, when useful, request completion times), to estimate what is
+ * unknown, namely in-device request service rate.
+ *
+ * The main issue is that, because of the above facts, the rate at
+ * which a certain set of requests is dispatched over a certain time
+ * interval can vary greatly with respect to the rate at which the
+ * same requests are then served. But, since the size of any
+ * intermediate queue is limited, and the service scheme is lossless
+ * (no request is silently dropped), the following obvious convergence
+ * property holds: the number of requests dispatched MUST become
+ * closer and closer to the number of requests completed as the
+ * observation interval grows. This is the key property used in
+ * the next function to estimate the peak service rate as a function
+ * of the observed dispatch rate. The function assumes that it is
+ * invoked on every request dispatch.
+ */
+static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+ u64 now_ns = ktime_get_ns(); /* timestamp taken for this dispatch */
+
+ if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
+ bfqd->peak_rate_samples);
+ bfq_reset_rate_computation(bfqd, rq);
+ goto update_last_values; /* will add one sample */
+ }
+
+ /*
+ * Device idle for very long: the observation interval lasting
+ * up to this dispatch cannot be a valid observation interval
+ * for computing a new peak rate (similarly to the late-
+ * completion event in bfq_completed_request()). Go to
+ * update_rate_and_reset to have the following three steps
+ * taken:
+ * - close the observation interval at the last (previous)
+ * request dispatch or completion
+ * - compute rate, if possible, for that observation interval
+ * - start a new observation interval with this dispatch
+ *
+ * "Very long" is checked below as: more than 100 ms elapsed
+ * since the last dispatch, with no request left in the driver.
+ */
+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+ bfqd->rq_in_driver == 0)
+ goto update_rate_and_reset;
+
+ /*
+ * Update sampling information. Every dispatch counts as one
+ * sample; the sample also counts as sequential if the device
+ * is busy (requests in the driver, or last completion more
+ * recent than BFQ_MIN_TT) and the distance of rq from the
+ * position reached by the previous dispatch is below
+ * BFQQ_SEEK_THR.
+ */
+ bfqd->peak_rate_samples++;
+
+ if ((bfqd->rq_in_driver > 0 ||
+ now_ns - bfqd->last_completion < BFQ_MIN_TT)
+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+ bfqd->sequential_samples++;
+
+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+ /*
+ * Reset max observed rq size every 32 dispatches; in between,
+ * keep the running maximum.
+ */
+ if (likely(bfqd->peak_rate_samples % 32))
+ bfqd->last_rq_max_size =
+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
+ else
+ bfqd->last_rq_max_size = blk_rq_sectors(rq);
+
+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
+
+ /* Target observation interval not yet reached, go on sampling */
+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+ goto update_last_values;
+
+update_rate_and_reset: /* close the current interval and start a new one */
+ bfq_update_rate_reset(bfqd, rq);
+update_last_values: /* record where and when this dispatch took place */
+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+ bfqd->last_dispatch = now_ns;
+}
+
+/*
+ * Account for the dispatch of rq and take it out of the internal lists.
+ */
+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
+{
+	struct bfq_queue *owner = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+
+	/*
+	 * To be fully consistent, the dispatched counter should be
+	 * bumped only after the request has been removed from the
+	 * queue and dispatched. The increment is anticipated on
+	 * purpose, at the price of a short-lived inconsistency: if
+	 * the request belongs to a queue that is not in service, an
+	 * early increment keeps two counters tied to
+	 * bfqq->dispatched from being pointlessly decremented and
+	 * then incremented again once the new value of
+	 * bfqq->dispatched is taken into account.
+	 */
+	owner->dispatched += 1;
+	bfq_update_peak_rate(bfqd, rq);
+
+	bfq_remove_request(q, rq);
+}
+
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/*
+	 * A queue shared among several processes stays merged only
+	 * as long as those processes keep issuing I/O within the
+	 * mean seek distance. Once the shared queue has turned
+	 * seeky, flag it so that it can be split again.
+	 */
+	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
+		bfq_mark_bfqq_split_coop(bfqq);
+
+	if (!RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		/* Backlog left: put the queue back among the active ones. */
+		bfq_requeue_bfqq(bfqd, bfqq);
+		/* Re-sort the priority tree of potential close cooperators. */
+		bfq_pos_tree_add_move(bfqd, bfqq);
+	} else {
+		/*
+		 * No backlog. If there is no outstanding request
+		 * either, record the current time in budget_timeout:
+		 * the field is overloaded to remember when the queue
+		 * became completely idle, an instant used by the
+		 * weight-raising mechanism.
+		 */
+		if (bfqq->dispatched == 0) {
+			bfqq->budget_timeout = jiffies;
+		}
+
+		bfq_del_bfqq_busy(bfqd, bfqq, true);
+	}
+
+	/*
+	 * The call below marks every in-service entity as no longer
+	 * in service, so all of them must already have been
+	 * deactivated or requeued by the code above.
+	 */
+	__bfq_bfqd_reset_in_service(bfqd);
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget at queue expiration: compute the
+ * new bfqq->max_budget and, if @bfqq still has a next request, the new
+ * bfqq->entity.budget. For a sync, non-weight-raised queue expired for
+ * a @reason not handled by the switch below, nothing is changed.
+ * See the body for detailed comments.
+ */
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
+				     struct bfq_queue *bfqq,
+				     enum bfqq_expiration reason)
+{
+	struct request *next_rq;
+	int budget, min_budget;
+
+	min_budget = bfq_min_budget(bfqd);
+
+	if (bfqq->wr_coeff == 1) {
+		budget = bfqq->max_budget;
+	} else {
+		/*
+		 * Use a constant, low budget for weight-raised queues,
+		 * to help achieve a low latency. Keep it slightly higher
+		 * than the minimum possible budget, to cause a little
+		 * bit fewer expirations.
+		 */
+		budget = 2 * min_budget;
+	}
+
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
+		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
+		budget, bfq_min_budget(bfqd));
+	/*
+	 * Both values logged here describe bfqq, the queue whose
+	 * budget is being recomputed (not whatever queue happens to
+	 * be recorded as in service in bfqd).
+	 */
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqq));
+
+	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
+		switch (reason) {
+		/*
+		 * Caveat: in all the following cases we trade latency
+		 * for throughput.
+		 */
+		case BFQQE_TOO_IDLE:
+			/*
+			 * This is the only case where we may reduce
+			 * the budget: if there is no request of the
+			 * process still waiting for completion, then
+			 * we assume (tentatively) that the timer has
+			 * expired because the batch of requests of
+			 * the process could have been served with a
+			 * smaller budget. Hence, betting that
+			 * process will behave in the same way when it
+			 * becomes backlogged again, we reduce its
+			 * next budget. As long as we guess right,
+			 * this budget cut reduces the latency
+			 * experienced by the process.
+			 *
+			 * However, if there are still outstanding
+			 * requests, then the process may have not yet
+			 * issued its next request just because it is
+			 * still waiting for the completion of some of
+			 * the still outstanding ones. So in this
+			 * subcase we do not reduce its budget, on the
+			 * contrary we increase it to possibly boost
+			 * the throughput, as discussed in the
+			 * comments to the BUDGET_TIMEOUT case.
+			 */
+			if (bfqq->dispatched > 0) /* still outstanding reqs */
+				budget = min(budget * 2, bfqd->bfq_max_budget);
+			else {
+				if (budget > 5 * min_budget)
+					budget -= 4 * min_budget;
+				else
+					budget = min_budget;
+			}
+			break;
+		case BFQQE_BUDGET_TIMEOUT:
+			/*
+			 * We double the budget here because it gives
+			 * the chance to boost the throughput if this
+			 * is not a seeky process (and has bumped into
+			 * this timeout because of, e.g., ZBR).
+			 */
+			budget = min(budget * 2, bfqd->bfq_max_budget);
+			break;
+		case BFQQE_BUDGET_EXHAUSTED:
+			/*
+			 * The process still has backlog, and did not
+			 * let either the budget timeout or the disk
+			 * idling timeout expire. Hence it is not
+			 * seeky, has a short thinktime and may be
+			 * happy with a higher budget too. So
+			 * definitely increase the budget of this good
+			 * candidate to boost the disk throughput.
+			 */
+			budget = min(budget * 4, bfqd->bfq_max_budget);
+			break;
+		case BFQQE_NO_MORE_REQUESTS:
+			/*
+			 * For queues that expire for this reason, it
+			 * is particularly important to keep the
+			 * budget close to the actual service they
+			 * need. Doing so reduces the timestamp
+			 * misalignment problem described in the
+			 * comments in the body of
+			 * __bfq_activate_entity. In fact, suppose
+			 * that a queue systematically expires for
+			 * BFQQE_NO_MORE_REQUESTS and presents a
+			 * new request in time to enjoy timestamp
+			 * back-shifting. The larger the budget of the
+			 * queue is with respect to the service the
+			 * queue actually requests in each service
+			 * slot, the more times the queue can be
+			 * reactivated with the same virtual finish
+			 * time. It follows that, even if this finish
+			 * time is pushed to the system virtual time
+			 * to reduce the consequent timestamp
+			 * misalignment, the queue unjustly enjoys for
+			 * many re-activations a lower finish time
+			 * than all newly activated queues.
+			 *
+			 * The service needed by bfqq is measured
+			 * quite precisely by bfqq->entity.service.
+			 * Since bfqq does not enjoy device idling,
+			 * bfqq->entity.service is equal to the number
+			 * of sectors that the process associated with
+			 * bfqq requested to read/write before waiting
+			 * for request completions, or blocking for
+			 * other reasons.
+			 */
+			budget = max_t(int, bfqq->entity.service, min_budget);
+			break;
+		default:
+			/* Any other reason: leave the budget untouched. */
+			return;
+		}
+	} else if (!bfq_bfqq_sync(bfqq)) {
+		/*
+		 * Async queues get always the maximum possible
+		 * budget, as for them we do not care about latency
+		 * (in addition, their ability to dispatch is limited
+		 * by the charging factor).
+		 */
+		budget = bfqd->bfq_max_budget;
+	}
+
+	bfqq->max_budget = budget;
+
+	/*
+	 * Once enough budgets have been assigned, and unless the
+	 * maximum budget has been set by the user, cap the per-queue
+	 * maximum budget to the device-wide one.
+	 */
+	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
+	    !bfqd->bfq_user_max_budget)
+		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
+
+	/*
+	 * If there is still backlog, then assign a new budget, making
+	 * sure that it is large enough for the next request. Since
+	 * the finish time of bfqq must be kept in sync with the
+	 * budget, be sure to call __bfq_bfqq_expire() *after* this
+	 * update.
+	 *
+	 * If there is no backlog, then no need to update the budget;
+	 * it will be updated on the arrival of a new request.
+	 */
+	next_rq = bfqq->next_rq;
+	if (next_rq)
+		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+					    bfq_serv_to_charge(next_rq, bfqq));
+
+	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
+			next_rq ? blk_rq_sectors(next_rq) : 0,
+			bfqq->entity.budget);
+}
+
+/*
+ * Return true if the process associated with bfqq is "slow". The slow
+ * flag is used, in addition to the budget timeout, to reduce the
+ * amount of service provided to seeky processes, and thus reduce
+ * their chances to lower the throughput. More details in the comments
+ * on the function bfq_bfqq_expire().
+ *
+ * An important observation is in order: as discussed in the comments
+ * on the function bfq_update_peak_rate(), with devices with internal
+ * queues, it is hard, if at all possible, to know when and for how long
+ * an I/O request is processed by the device (apart from the trivial
+ * I/O pattern where a new request is dispatched only after the
+ * previous one has been completed). This makes it hard to evaluate
+ * the real rate at which the I/O requests of each bfq_queue are
+ * served. In fact, for an I/O scheduler like BFQ, serving a
+ * bfq_queue means just dispatching its requests during its service
+ * slot (i.e., until the budget of the queue is exhausted, or the
+ * queue remains idle, or, finally, a timeout fires). But, during the
+ * service slot of a bfq_queue, around 100 ms at most, the device may
+ * be even still processing requests of bfq_queues served in previous
+ * service slots. On the opposite end, the requests of the in-service
+ * bfq_queue may be completed after the service slot of the queue
+ * finishes.
+ *
+ * Anyway, unless more sophisticated solutions are used
+ * (where possible), the sum of the sizes of the requests dispatched
+ * during the service slot of a bfq_queue is probably the only
+ * approximation available for the service received by the bfq_queue
+ * during its service slot. And this sum is the quantity used in this
+ * function to evaluate the I/O speed of a process.
+ */
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			     bool compensate, enum bfqq_expiration reason,
+			     unsigned long *delta_ms)
+{
+	/* Fallback verdict, used when the interval is too short to judge. */
+	bool slow = BFQQ_SEEKY(bfqq);
+	ktime_t slot_end;
+	u32 elapsed_us;
+
+	/* Only sync queues can be deemed slow; *delta_ms is left untouched. */
+	if (!bfq_bfqq_sync(bfqq))
+		return false;
+
+	/*
+	 * Measure the time elapsed since the budget was started. When
+	 * compensating, the interval ends at the start of the last
+	 * idling period rather than now.
+	 */
+	slot_end = compensate ? bfqd->last_idling_start : ktime_get();
+	elapsed_us = ktime_to_us(ktime_sub(slot_end, bfqd->last_budget_start));
+
+	/* Intervals shorter than 1 ms are not trusted. */
+	if (elapsed_us < 1000) {
+		if (blk_queue_nonrot(bfqd->queue)) {
+			/*
+			 * Same worst-case guarantees as idling for
+			 * seeky queues.
+			 */
+			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
+		} else {
+			/* Charge at least one seek. */
+			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
+		}
+
+		return slow;
+	}
+
+	*delta_ms = elapsed_us / USEC_PER_MSEC;
+
+	/*
+	 * Re-evaluate slowness from the service received only over
+	 * long (> 20 ms) intervals, so that spikes do not distort the
+	 * estimate. On rotational devices a process working in the
+	 * slower disk zones tends to be slow(er) even if not seeky,
+	 * while the reference is likely an average over the whole
+	 * surface; to avoid being too harsh, the queue is deemed slow
+	 * only if it received less than half of bfqd->bfq_max_budget.
+	 */
+	if (elapsed_us > 20000)
+		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
+
+	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
+
+	return slow;
+}
+
+/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to playback or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application from
+ * being deemed soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which the arrival of a request is waited
+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ * HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ * for a while, then suddenly 'jump' by several units to recover the lost
+ * increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+						struct bfq_queue *bfqq)
+{
+	/*
+	 * Bandwidth rule: instant before which a new request would
+	 * push the queue above bfq_wr_max_softrt_rate.
+	 */
+	unsigned long bw_bound = bfqq->last_idle_bklogged +
+				 HZ * bfqq->service_from_backlogged /
+				 bfqd->bfq_wr_max_softrt_rate;
+	/*
+	 * Greedy-application filter: the idling time plus a few
+	 * jiffies of slack from now.
+	 */
+	unsigned long idle_bound = jiffies +
+		nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4;
+
+	/* The later of the two instants wins. */
+	return max(bw_bound, idle_bound);
+}
+
+/*
+ * Farthest instant in the future, measured from the current value of
+ * jiffies, that the jiffies macros allow for.
+ */
+static unsigned long bfq_greatest_from_now(void)
+{
+	unsigned long now = jiffies;
+
+	return now + MAX_JIFFY_OFFSET;
+}
+
+/*
+ * Farthest instant in the past, measured from the current value of
+ * jiffies, that the jiffies macros allow for.
+ */
+static unsigned long bfq_smallest_from_now(void)
+{
+	unsigned long now = jiffies;
+
+	return now - MAX_JIFFY_OFFSET;
+}
+
+/**
+ * bfq_bfqq_expire - expire a queue.
+ * @bfqd: device owning the queue.
+ * @bfqq: the queue to expire.
+ * @compensate: if true, compensate for the time spent idling.
+ * @reason: the reason causing the expiration.
+ *
+ * If the process associated with bfqq does slow I/O (e.g., because it
+ * issues random requests), we charge bfqq with the time it has been
+ * in service instead of the service it has received (see
+ * bfq_bfqq_charge_time for details on how this goal is achieved). As
+ * a consequence, bfqq will typically get higher timestamps upon
+ * reactivation, and hence it will be rescheduled as if it had
+ * received more service than what it has actually received. In the
+ * end, bfqq receives less service in proportion to how slowly its
+ * associated process consumes its budgets (and hence how seriously it
+ * tends to lower the throughput). In addition, this time-charging
+ * strategy guarantees time fairness among slow processes. In
+ * contrast, if the process associated with bfqq is not slow, we
+ * charge bfqq exactly with the service it has received.
+ *
+ * Charging time to the first type of queues and the exact service to
+ * the other has the effect of using the WF2Q+ policy to schedule the
+ * former on a timeslice basis, without violating service domain
+ * guarantees among the latter.
+ */
+void bfq_bfqq_expire(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ bool compensate,
+ enum bfqq_expiration reason)
+{
+ bool slow;
+ unsigned long delta = 0; /* ms; set by bfq_bfqq_is_slow() for sync queues, else stays 0 */
+ struct bfq_entity *entity = &bfqq->entity;
+ int ref; /* bfqq->ref as sampled right before __bfq_bfqq_expire() */
+
+ /*
+ * Check whether the process is slow (see bfq_bfqq_is_slow).
+ * The same call also stores in delta the duration, in ms, to
+ * be possibly charged below.
+ */
+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
+
+ /*
+ * Increase service_from_backlogged before next statement,
+ * because the possible next invocation of
+ * bfq_bfqq_charge_time would likely inflate
+ * entity->service. In contrast, service_from_backlogged must
+ * contain real service, to enable the soft real-time
+ * heuristic to correctly compute the bandwidth consumed by
+ * bfqq.
+ */
+ bfqq->service_from_backlogged += entity->service;
+
+ /*
+ * As above explained, charge slow (typically seeky) and
+ * timed-out queues with the time and not the service
+ * received, to favor sequential workloads.
+ *
+ * Processes doing I/O in the slower disk zones will tend to
+ * be slow(er) even if not seeky. Therefore, since the
+ * estimated peak rate is actually an average over the disk
+ * surface, these processes may timeout just for bad luck. To
+ * avoid punishing them, do not charge time to processes that
+ * succeeded in consuming at least 2/3 of their budget. This
+ * allows BFQ to preserve enough elasticity to still perform
+ * bandwidth, and not time, distribution with little unlucky
+ * or quasi-sequential processes.
+ *
+ * Time is charged only to non-weight-raised queues
+ * (wr_coeff == 1).
+ */
+ if (bfqq->wr_coeff == 1 &&
+ (slow ||
+ (reason == BFQQE_BUDGET_TIMEOUT &&
+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
+ bfq_bfqq_charge_time(bfqd, bfqq, delta);
+
+ if (reason == BFQQE_TOO_IDLE &&
+ entity->service <= 2 * entity->budget / 10)
+ bfq_clear_bfqq_IO_bound(bfqq); /* idled out after using at most 20% of its budget */
+
+ if (bfqd->low_latency && bfqq->wr_coeff == 1)
+ bfqq->last_wr_start_finish = jiffies;
+
+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ /*
+ * If we get here, and there are no outstanding
+ * requests, then the request pattern is isochronous
+ * (see the comments on the function
+ * bfq_bfqq_softrt_next_start()). Thus we can compute
+ * soft_rt_next_start. If, instead, the queue still
+ * has outstanding requests, then we have to wait for
+ * the completion of all the outstanding requests to
+ * discover whether the request pattern is actually
+ * isochronous.
+ */
+ if (bfqq->dispatched == 0)
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+ else {
+ /*
+ * The application is still waiting for the
+ * completion of one or more requests:
+ * prevent it from possibly being incorrectly
+ * deemed as soft real-time by setting its
+ * soft_rt_next_start to infinity. In fact,
+ * without this assignment, the application
+ * would be incorrectly deemed as soft
+ * real-time if:
+ * 1) it issued a new request before the
+ * completion of all its in-flight
+ * requests, and
+ * 2) at that time, its soft_rt_next_start
+ * happened to be in the past.
+ */
+ bfqq->soft_rt_next_start =
+ bfq_greatest_from_now();
+ /*
+ * Schedule an update of soft_rt_next_start to when
+ * the task may be discovered to be isochronous.
+ */
+ bfq_mark_bfqq_softrt_update(bfqq);
+ }
+ }
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason,
+ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq));
+
+ /*
+ * Increase, decrease or leave budget unchanged according to
+ * reason. The budget is recomputed before the queue is
+ * actually expired, and bfqq->ref is sampled in between, so
+ * that the check further below uses the pre-expiration
+ * reference count (NOTE(review): presumably because
+ * __bfq_bfqq_expire() may drop references - confirm).
+ */
+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+ ref = bfqq->ref;
+ __bfq_bfqq_expire(bfqd, bfqq);
+
+ /*
+ * mark bfqq as waiting a request only if a bic still points to it
+ * (ref > 1), bfqq is no longer busy, and the expiration was caused
+ * neither by a budget timeout nor by budget exhaustion
+ */
+ if (ref > 1 && !bfq_bfqq_busy(bfqq) &&
+ reason != BFQQE_BUDGET_TIMEOUT &&
+ reason != BFQQE_BUDGET_EXHAUSTED)
+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+/*
+ * There is no dedicated timer for the budget timeout: the deadline
+ * stored in bfqq->budget_timeout is simply compared against jiffies
+ * whenever a request arrives or completes, and when the idle timer
+ * expires.
+ */
+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+{
+	bool expired = time_is_before_eq_jiffies(bfqq->budget_timeout);
+
+	return expired;
+}
+
+/*
+ * Expiring a queue while the device is being idled for it (i.e.,
+ * while the queue is actively waiting for a new request) may trigger
+ * the timestamp misalignment problem described in the body of
+ * __bfq_activate_entity. So a budget timeout is honoured only if the
+ * queue is not waiting for a request, or if it is slow enough (at
+ * least one third of its budget still unused) that kicking it off is
+ * worth it to preserve a high throughput.
+ */
+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		"may_budget_timeout: wait_request %d left %d timeout %d",
+		bfq_bfqq_wait_request(bfqq),
+			bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3,
+		bfq_bfqq_budget_timeout(bfqq));
+
+	/* Waiting for a request and not slow enough: never expire. */
+	if (bfq_bfqq_wait_request(bfqq) &&
+	    bfq_bfqq_budget_left(bfqq) < bfqq->entity.budget / 3)
+		return false;
+
+	/* Otherwise the outcome depends only on the timeout itself. */
+	return bfq_bfqq_budget_timeout(bfqq);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for the queue. As a consequence, since
+ * device idling plays a critical role in both throughput boosting and
+ * service guarantees, the return value of this function plays a
+ * critical role in both these aspects as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * individually while introducing the variables.
+ */
+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ bool idling_boosts_thr, idling_boosts_thr_without_issues,
+ idling_needed_for_service_guarantees,
+ asymmetric_scenario;
+
+ if (bfqd->strict_guarantees)
+ return true;
+
+ /*
+ * The next variable takes into account the cases where idling
+ * boosts the throughput.
+ *
+ * The value of the variable is computed considering, first, that
+ * idling is virtually always beneficial for the throughput if:
+ * (a) the device is not NCQ-capable, or
+ * (b) regardless of the presence of NCQ, the device is rotational
+ * and the request pattern for bfqq is I/O-bound and sequential.
+ *
+ * Secondly, and in contrast to the above item (b), idling an
+ * NCQ-capable flash-based device would not boost the
+ * throughput even with sequential I/O; rather it would lower
+ * the throughput in proportion to how fast the device
+ * is. Accordingly, the next variable is true if any of the
+ * above conditions (a) and (b) is true, and, in particular,
+ * happens to be false if bfqd is an NCQ-capable flash-based
+ * device.
+ */
+ idling_boosts_thr = !bfqd->hw_tag ||
+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+ bfq_bfqq_idle_window(bfqq));
+
+ /*
+ * The value of the next variable,
+ * idling_boosts_thr_without_issues, is equal to that of
+ * idling_boosts_thr, unless a special case holds. In this
+ * special case, described below, idling may cause problems to
+ * weight-raised queues.
+ *
+ * When the request pool is saturated (e.g., in the presence
+ * of write hogs), if the processes associated with
+ * non-weight-raised queues ask for requests at a lower rate,
+ * then processes associated with weight-raised queues have a
+ * higher probability to get a request from the pool
+ * immediately (or at least soon) when they need one. Thus
+ * they have a higher probability to actually get a fraction
+ * of the device throughput proportional to their high
+ * weight. This is especially true with NCQ-capable drives,
+ * which enqueue several requests in advance, and further
+ * reorder internally-queued requests.
+ *
+ * For this reason, we force to false the value of
+ * idling_boosts_thr_without_issues if there are weight-raised
+ * busy queues. In this case, and if bfqq is not weight-raised,
+ * this guarantees that the device is not idled for bfqq (if,
+ * instead, bfqq is weight-raised, then idling will be
+ * guaranteed by another variable, see below). Combined with
+ * the timestamping rules of BFQ (see [1] for details), this
+ * behavior causes bfqq, and hence any sync non-weight-raised
+ * queue, to get a lower number of requests served, and thus
+ * to ask for a lower number of requests from the request
+ * pool, before the busy weight-raised queues get served
+ * again. This often mitigates starvation problems in the
+ * presence of heavy write workloads and NCQ, thereby
+ * guaranteeing a higher application and system responsiveness
+ * in these hostile scenarios.
+ */
+ idling_boosts_thr_without_issues = idling_boosts_thr &&
+ bfqd->wr_busy_queues == 0;
+
+ /*
+ * There is then a case where idling must be performed not
+ * for throughput concerns, but to preserve service
+ * guarantees.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired
+ * device-throughput distribution only in a completely
+ * symmetric scenario where:
+ * (i) each of these processes must get the same throughput as
+ * the others;
+ * (ii) all these processes have the same I/O pattern
+ * (either sequential or random).
+ * In fact, in such a scenario, the drive will tend to treat
+ * the requests of each of these processes in about the same
+ * way as the requests of the others, and thus to provide
+ * each of these processes with about the same throughput
+ * (which is exactly the desired throughput distribution). In
+ * contrast, in any asymmetric scenario, device idling is
+ * certainly needed to guarantee that bfqq receives its
+ * assigned fraction of the device throughput (see [1] for
+ * details).
+ *
+ * We address this issue by controlling, actually, only the
+ * symmetry sub-condition (i), i.e., provided that
+ * sub-condition (i) holds, idling is not performed,
+ * regardless of whether sub-condition (ii) holds. In other
+ * words, only if sub-condition (i) does not hold, then idling
+ * is allowed, and the device tends to be prevented from queueing
+ * many requests, possibly of several processes. The reason
+ * for not controlling also sub-condition (ii) is that we
+ * exploit preemption to preserve guarantees in case of
+ * symmetric scenarios, even if (ii) does not hold, as
+ * explained in the next two paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness. The motivation for using
+ * preemption instead of idling is that, by not idling,
+ * service guarantees are preserved without minimally
+ * sacrificing throughput. In other words, both a high
+ * throughput and its desired distribution are obtained.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternately, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * On the other hand, device idling is performed, and thus
+ * pure sector-domain guarantees are provided, for the
+ * following queues, which are likely to need stronger
+ * throughput guarantees: weight-raised queues, and queues
+ * with a higher weight than other queues. When such queues
+ * are active, sub-condition (i) is false, which triggers
+ * device idling.
+ *
+ * According to the above considerations, the next variable is
+ * true (only) if sub-condition (i) does not hold. To compute the
+ * value of this variable, we not only use the return value of
+ * the function bfq_symmetric_scenario(), but also check
+ * whether bfqq is being weight-raised, because
+ * bfq_symmetric_scenario() does not take into account also
+ * weight-raised queues (see comments on
+ * bfq_weights_tree_add()).
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the
+ * following unlucky scenario: if idling is (correctly)
+ * disabled in a time period during which all symmetry
+ * sub-conditions hold, and hence the device is allowed to
+ * enqueue many requests, but at some later point in time some
+ * sub-condition stops to hold, then it may become impossible
+ * to let requests be served in the desired order until all
+ * the requests already queued in the device have been served.
+ */
+ asymmetric_scenario = bfqq->wr_coeff > 1 ||
+ !bfq_symmetric_scenario(bfqd);
+
+ /*
+ * Finally, there is a case where maximizing throughput is the
+ * best choice even if it may cause unfairness toward
+ * bfqq. Such a case is when bfqq became active in a burst of
+ * queue activations. Queues that became active during a large
+ * burst benefit only from throughput, as discussed in the
+ * comments on bfq_handle_burst. Thus, if bfqq became active
+ * in a burst and not idling the device maximizes throughput,
+ * then the device must not be idled, because not idling the
+ * device provides bfqq and all other queues in the burst with
+ * maximum benefit. Combining this and the above case, we can
+ * now establish when idling is actually needed to preserve
+ * service guarantees.
+ */
+ idling_needed_for_service_guarantees =
+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+
+ /*
+ * We have now all the components we need to compute the return
+ * value of the function, which is true only if both the following
+ * conditions hold:
+ * 1) bfqq is sync, because idling makes sense only for sync queues;
+ * 2) idling either boosts the throughput (without issues), or
+ * is necessary to preserve service guarantees.
+ */
+ return bfq_bfqq_sync(bfqq) &&
+ (idling_boosts_thr_without_issues ||
+ idling_needed_for_service_guarantees);
+}
+
+/*
+ * Tell whether the device has to be idled on behalf of bfqq.
+ *
+ * The answer is yes only when all of the following hold: bfqq has no
+ * request left in its sort_list, idling is enabled (bfq_slice_idle is
+ * non-zero), and bfq_bfqq_may_idle() returns true. When this happens
+ * for the in-service queue:
+ * 1) the queue must remain in service and cannot be expired, and
+ * 2) the device must be idled to wait for the possible arrival of a
+ *    new request for the queue.
+ * The reasons why idling is then the best choice, both for throughput
+ * and for service guarantees, are given in the comments on
+ * bfq_bfqq_may_idle().
+ */
+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+{
+	/* a backlogged queue has something to dispatch: no need to wait */
+	if (!RB_EMPTY_ROOT(&bfqq->sort_list))
+		return false;
+
+	/* idling disabled altogether */
+	if (bfqq->bfqd->bfq_slice_idle == 0)
+		return false;
+
+	return bfq_bfqq_may_idle(bfqq);
+}
+
+/*
+ * Select a queue for service. If we have a current queue in service,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ *
+ * Returns the queue from which the next request has to be dispatched,
+ * or NULL if nothing has to be dispatched now. NULL is returned both
+ * when no queue could be set in service, and when the in-service queue
+ * has no request queued but is nevertheless kept in service (because
+ * it is waiting for a new request, or still has dispatched requests
+ * and may idle after their completion).
+ */
+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+{
+ struct bfq_queue *bfqq;
+ struct request *next_rq;
+ /* expiration reason used if the first check below jumps to expire */
+ enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
+
+ bfqq = bfqd->in_service_queue;
+ if (!bfqq)
+ goto new_queue;
+
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+
+ /*
+ * Expire for budget timeout only if the queue is neither already
+ * waiting for a new request nor required to idle.
+ */
+ if (bfq_may_expire_for_budg_timeout(bfqq) &&
+ !bfq_bfqq_wait_request(bfqq) &&
+ !bfq_bfqq_must_idle(bfqq))
+ goto expire;
+
+check_queue:
+ /*
+ * This loop is rarely executed more than once. Even when it
+ * happens, it is much more convenient to re-execute this loop
+ * than to return NULL and trigger a new dispatch to get a
+ * request served.
+ */
+ next_rq = bfqq->next_rq;
+ /*
+ * If bfqq has requests queued and it has enough budget left to
+ * serve them, keep the queue, otherwise expire it.
+ */
+ if (next_rq) {
+ if (bfq_serv_to_charge(next_rq, bfqq) >
+ bfq_bfqq_budget_left(bfqq)) {
+ /*
+ * Expire the queue for budget exhaustion,
+ * which makes sure that the next budget is
+ * enough to serve the next request, even if
+ * it comes from the fifo expired path.
+ */
+ reason = BFQQE_BUDGET_EXHAUSTED;
+ goto expire;
+ } else {
+ /*
+ * The idle timer may be pending because we may
+ * not disable disk idling even when a new request
+ * arrives.
+ */
+ if (bfq_bfqq_wait_request(bfqq)) {
+ /*
+ * If we get here: 1) at least a new request
+ * has arrived but we have not disabled the
+ * timer because the request was too small,
+ * 2) then the block layer has unplugged
+ * the device, causing the dispatch to be
+ * invoked.
+ *
+ * Since the device is unplugged, now the
+ * requests are probably large enough to
+ * provide a reasonable throughput.
+ * So we disable idling.
+ */
+ bfq_clear_bfqq_wait_request(bfqq);
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
+ }
+ goto keep_queue;
+ }
+ }
+
+ /*
+ * No requests pending. However, if the in-service queue is idling
+ * for a new request, or has requests waiting for a completion and
+ * may idle after their completion, then keep it anyway.
+ */
+ if (bfq_bfqq_wait_request(bfqq) ||
+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+ /*
+ * The queue stays in service (it is not expired), but NULL
+ * is returned because there is nothing to dispatch from it.
+ */
+ bfqq = NULL;
+ goto keep_queue;
+ }
+
+ reason = BFQQE_NO_MORE_REQUESTS;
+expire:
+ bfq_bfqq_expire(bfqd, bfqq, false, reason);
+new_queue:
+ bfqq = bfq_set_in_service_queue(bfqd);
+ if (bfqq) {
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue");
+ /* re-run the budget check on the newly selected queue */
+ goto check_queue;
+ }
+keep_queue:
+ if (bfqq)
+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue");
+ else
+ bfq_log(bfqd, "select_queue: no queue returned");
+
+ return bfqq;
+}
+
+/*
+ * Update the weight-raising state of bfqq and, if needed, realign the
+ * weight of its entity with that state.
+ *
+ * If bfqq is being weight-raised (wr_coeff > 1), weight raising is
+ * ended when bfqq is in a large burst. Otherwise, once the current
+ * weight-raising period (last_wr_start_finish + wr_cur_max_time) is
+ * over, weight raising is either ended or switched back to the
+ * interactive parameters, as detailed in the body.
+ *
+ * In any case, the entity weight is recomputed whenever "weight is
+ * above orig_weight" and "wr_coeff > 1" disagree.
+ */
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+ bfq_log_bfqq(bfqd, bfqq,
+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+ jiffies_to_msecs(bfqq->wr_cur_max_time),
+ bfqq->wr_coeff,
+ bfqq->entity.weight, bfqq->entity.orig_weight);
+
+ if (entity->prio_changed)
+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+ /*
+ * If the queue was activated in a burst, or too much
+ * time has elapsed from the beginning of this
+ * weight-raising period, then end weight raising.
+ */
+ if (bfq_bfqq_in_large_burst(bfqq))
+ bfq_bfqq_end_wr(bfqq);
+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+ bfqq->wr_cur_max_time)) {
+ /*
+ * The period is over. End weight raising unless the
+ * period had the bfq_wr_rt_max_time duration
+ * (presumably a soft real-time period -- confirm) and
+ * bfq_wr_duration() has not yet elapsed since
+ * wr_start_at_switch_to_srt.
+ */
+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+ bfq_wr_duration(bfqd)))
+ bfq_bfqq_end_wr(bfqq);
+ else {
+ /* switch back to interactive wr */
+ bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+ bfqq->last_wr_start_finish =
+ bfqq->wr_start_at_switch_to_srt;
+ bfqq->entity.prio_changed = 1;
+ }
+ }
+ }
+ /* Update weight both if it must be raised and if it must be lowered */
+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
+ __bfq_entity_update_weight_prio(
+ bfq_entity_service_tree(entity),
+ entity);
+}
+
+/*
+ * Dispatch the next request (bfqq->next_rq) of bfqq and return it.
+ *
+ * The amount computed by bfq_serv_to_charge() for the request is
+ * accounted to bfqq through bfq_bfqq_served(), then the request is
+ * removed from the scheduler with bfq_dispatch_remove().
+ */
+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
+						 struct bfq_queue *bfqq)
+{
+	struct request *next = bfqq->next_rq;
+	unsigned long charge = bfq_serv_to_charge(next, bfqq);
+
+	bfq_bfqq_served(bfqq, charge);
+
+	bfq_dispatch_remove(bfqd->queue, next);
+
+	/*
+	 * If weight raising has to terminate for bfqq, the call below
+	 * updates the weight of bfqq immediately, without waiting for
+	 * the next activation. As a consequence, on expiration, bfqq
+	 * is timestamped as if it had never been weight-raised during
+	 * this service slot, even if it has received part or even
+	 * most of the service as a weight-raised queue. This inflates
+	 * the timestamps of bfqq, which is beneficial, as bfqq is then
+	 * more willing to leave the device immediately to possible
+	 * other weight-raised queues.
+	 */
+	bfq_update_wr_data(bfqd, bfqq);
+
+	/*
+	 * A CLASS_IDLE queue must not hold the device while other
+	 * queues are waiting for service: expire it, pretending that
+	 * its budget is exhausted.
+	 */
+	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
+		bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
+
+	return next;
+}
+
+/*
+ * Report whether there may be something to dispatch: either a request
+ * sitting in the bfqd->dispatch list, or at least one busy queue.
+ */
+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
+{
+	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+
+	if (!list_empty_careful(&bfqd->dispatch))
+		return true;
+
+	/*
+	 * busy_queues is read without taking the lock: a race on it
+	 * should cause at most a call to dispatch for nothing.
+	 */
+	return bfqd->busy_queues > 0;
+}
+
+/*
+ * Pick the next request to dispatch, or return NULL if there is none.
+ * The caller visible in this file, bfq_dispatch_request(), invokes this
+ * function with bfqd->lock held.
+ *
+ * Requests in the bfqd->dispatch list are served first, bypassing queue
+ * selection. Otherwise a queue is chosen with bfq_select_queue() and
+ * its next request is dispatched. Every returned request is flagged
+ * with RQF_STARTED.
+ */
+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ struct request *rq = NULL;
+ struct bfq_queue *bfqq = NULL;
+
+ if (!list_empty(&bfqd->dispatch)) {
+ rq = list_first_entry(&bfqd->dispatch, struct request,
+ queuelist);
+ list_del_init(&rq->queuelist);
+
+ bfqq = RQ_BFQQ(rq);
+
+ if (bfqq) {
+ /*
+ * Increment counters here, because this
+ * dispatch does not follow the standard
+ * dispatch flow (where counters are
+ * incremented)
+ */
+ bfqq->dispatched++;
+
+ goto inc_in_driver_start_rq;
+ }
+
+ /*
+ * We exploit the put_rq_private hook to decrement
+ * rq_in_driver, but put_rq_private will not be
+ * invoked on this request. So, to avoid unbalance,
+ * just start this request, without incrementing
+ * rq_in_driver. As a negative consequence,
+ * rq_in_driver is deceptively lower than it should be
+ * while this request is in service. This may cause
+ * bfq_schedule_dispatch to be invoked uselessly.
+ *
+ * As for implementing an exact solution, the
+ * put_request hook, if defined, is probably invoked
+ * also on this request. So, by exploiting this hook,
+ * we could 1) increment rq_in_driver here, and 2)
+ * decrement it in put_request. Such a solution would
+ * let the value of the counter be always accurate,
+ * but it would entail using an extra interface
+ * function. This cost seems higher than the benefit,
+ * being the frequency of non-elevator-private
+ * requests very low.
+ */
+ goto start_rq;
+ }
+
+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+
+ if (bfqd->busy_queues == 0)
+ goto exit;
+
+ /*
+ * Force device to serve one request at a time if
+ * strict_guarantees is true. Forcing this service scheme is
+ * currently the ONLY way to guarantee that the request
+ * service order enforced by the scheduler is respected by a
+ * queueing device. Otherwise the device is free even to make
+ * some unlucky request wait for as long as the device
+ * wishes.
+ *
+ * Of course, serving one request at a time may cause loss of
+ * throughput.
+ */
+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+ goto exit;
+
+ bfqq = bfq_select_queue(bfqd);
+ if (!bfqq)
+ goto exit;
+
+ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);
+
+ /*
+ * The two labels below are also jumped to from the dispatch-list
+ * path above, where rq is known to be non-NULL.
+ */
+ if (rq) {
+inc_in_driver_start_rq:
+ bfqd->rq_in_driver++;
+start_rq:
+ rq->rq_flags |= RQF_STARTED;
+ }
+exit:
+ return rq;
+}
+
+/*
+ * Locked wrapper around __bfq_dispatch_request(): take bfqd->lock
+ * (disabling interrupts), pick the request to dispatch, drop the lock
+ * and hand the request, or NULL, back to the caller.
+ */
+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+	struct request *picked;
+
+	spin_lock_irq(&bfqd->lock);
+	picked = __bfq_dispatch_request(hctx);
+	spin_unlock_irq(&bfqd->lock);
+
+	return picked;
+}
+
+/*
+ * Task holds one reference to the queue, dropped when task exits. Each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
+ * Scheduler lock must be held here. Recall not to use bfqq after calling
+ * this function on it.
+ *
+ * Drops one reference to bfqq. When the reference counter reaches zero,
+ * bfqq is removed from the burst list (sync queues only), freed, and,
+ * with CONFIG_BFQ_GROUP_IOSCHED, bfqg_put() is invoked on its group.
+ */
+void bfq_put_queue(struct bfq_queue *bfqq)
+{
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ /* fetched now, because it is needed after bfqq has been freed */
+ struct bfq_group *bfqg = bfqq_group(bfqq);
+#endif
+
+ if (bfqq->bfqd)
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
+ bfqq, bfqq->ref);
+
+ bfqq->ref--;
+ if (bfqq->ref)
+ return;
+
+ if (bfq_bfqq_sync(bfqq))
+ /*
+ * The fact that this queue is being destroyed does not
+ * invalidate the fact that this queue may have been
+ * activated during the current burst. As a consequence,
+ * although the queue does not exist anymore, and hence
+ * needs to be removed from the burst list if there,
+ * the burst size has not to be decremented.
+ */
+ hlist_del_init(&bfqq->burst_list_node);
+
+ kmem_cache_free(bfq_pool, bfqq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ bfqg_put(bfqg);
+#endif
+}
+
+/*
+ * Drop the references held on the queues bfqq was scheduled to merge
+ * with: walk the new_bfqq chain starting from bfqq->new_bfqq and put
+ * every queue found, stopping at the end of the chain or as soon as
+ * the chain leads back to bfqq itself. See bfq_setup_merge and
+ * bfq_merge_bfqqs for where these references are taken.
+ */
+static void bfq_put_cooperator(struct bfq_queue *bfqq)
+{
+	struct bfq_queue *cur, *following;
+
+	for (cur = bfqq->new_bfqq; cur && cur != bfqq; cur = following) {
+		/* read the link first: the put may free cur */
+		following = cur->new_bfqq;
+		bfq_put_queue(cur);
+	}
+}
+
+/*
+ * Detach an exiting process from bfqq: if bfqq is the in-service
+ * queue, expire it and schedule a new dispatch; then drop the
+ * references on the queues bfqq was going to merge with, and finally
+ * the process reference on bfqq itself.
+ */
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	if (bfqd->in_service_queue == bfqq) {
+		__bfq_bfqq_expire(bfqd, bfqq);
+		bfq_schedule_dispatch(bfqd);
+	}
+
+	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
+
+	bfq_put_cooperator(bfqq);
+
+	/* release process reference */
+	bfq_put_queue(bfqq);
+}
+
+/*
+ * Release the sync or async queue (depending on is_sync) attached to
+ * bic, if there is one and its scheduler data is still around. The
+ * release and the reset of the bic pointer are done under bfqd->lock.
+ */
+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+	struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+	struct bfq_data *bfqd;
+	unsigned long flags;
+
+	if (!bfqq)
+		return;
+
+	bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
+	if (!bfqd)
+		return;
+
+	spin_lock_irqsave(&bfqd->lock, flags);
+	bfq_exit_bfqq(bfqd, bfqq);
+	bic_set_bfqq(bic, NULL, is_sync);
+	spin_unlock_irqrestore(&bfqd->lock, flags);
+}
+
+/*
+ * Release both queues attached to the bic embedding icq: first the
+ * sync one, then the async one.
+ */
+static void bfq_exit_icq(struct io_cq *icq)
+{
+	struct bfq_io_cq *owner = icq_to_bic(icq);
+
+	bfq_exit_icq_bfqq(owner, true);	/* sync queue */
+	bfq_exit_icq_bfqq(owner, false);	/* async queue */
+}
+
+/*
+ * Update the entity prio values; note that the new values will not
+ * be used until the next (re)activation.
+ *
+ * The new priority class and level of bfqq are derived from
+ * bic->ioprio. With no class set (IOPRIO_CLASS_NONE), or with an
+ * unknown class (which is also reported with dev_err), they are
+ * inherited from the CPU scheduling settings of the current task.
+ * The resulting level is then turned into the new weight of the
+ * entity, and prio_changed is raised so that the change is noticed.
+ *
+ * Nothing is done if bfqq has no scheduler data (bfqq->bfqd is NULL).
+ */
+static void
+bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+	struct task_struct *tsk = current;
+	int ioprio_class;
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	if (!bfqd)
+		return;
+
+	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+	switch (ioprio_class) {
+	default:
+		dev_err(bfqd->queue->backing_dev_info->dev,
+			"bfq: bad prio class %d\n", ioprio_class);
+		/* fall through: treat a bad class as no class at all */
+	case IOPRIO_CLASS_NONE:
+		/*
+		 * No prio set, inherit CPU scheduling settings.
+		 */
+		bfqq->new_ioprio = task_nice_ioprio(tsk);
+		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
+		break;
+	case IOPRIO_CLASS_RT:
+		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
+		break;
+	case IOPRIO_CLASS_BE:
+		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
+		break;
+	case IOPRIO_CLASS_IDLE:
+		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
+		bfqq->new_ioprio = 7;
+		bfq_clear_bfqq_idle_window(bfqq);
+		break;
+	}
+
+	/*
+	 * Valid levels are 0 .. IOPRIO_BE_NR - 1. An out-of-range level
+	 * is reported and clamped to the last valid one: clamping to
+	 * IOPRIO_BE_NR would leave the level out of range.
+	 */
+	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
+		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
+			bfqq->new_ioprio);
+		bfqq->new_ioprio = IOPRIO_BE_NR - 1;
+	}
+
+	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+	bfqq->entity.prio_changed = 1;
+}
+
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+ struct bio *bio, bool is_sync,
+ struct bfq_io_cq *bic);
+
+/*
+ * Propagate a change of the io priority of the io context to the
+ * queues attached to bic.
+ *
+ * If the priority cached in bic differs from the current one of the
+ * io context, the cached value is refreshed; then the async queue, if
+ * any, is released and looked up again through bfq_get_queue(), while
+ * the sync queue, if any, just gets its next prio data updated.
+ */
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	int cur_ioprio = bic->icq.ioc->ioprio;
+	struct bfq_queue *async_q, *sync_q;
+
+	/* no scheduler data behind this bic: nothing can be updated */
+	if (unlikely(!bfqd))
+		return;
+
+	/* common case: priority unchanged */
+	if (likely(bic->ioprio == cur_ioprio))
+		return;
+
+	bic->ioprio = cur_ioprio;
+
+	async_q = bic_to_bfqq(bic, false);
+	if (async_q) {
+		/* release process reference on this queue */
+		bfq_put_queue(async_q);
+		async_q = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+		bic_set_bfqq(bic, async_q, false);
+	}
+
+	sync_q = bic_to_bfqq(bic, true);
+	if (sync_q)
+		bfq_set_next_ioprio_data(sync_q, bic);
+}
+
+/*
+ * Initialize the fields of a freshly obtained bfqq.
+ *
+ * @bfqd: scheduler data the queue belongs to
+ * @bfqq: queue to initialize; its reference counter is set to 0
+ * @bic: io context data used to set the next prio data; may be NULL,
+ *       in which case no prio data is set here
+ * @pid: pid recorded in the queue
+ * @is_sync: non-zero for a sync queue
+ */
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic, pid_t pid, int is_sync)
+{
+ RB_CLEAR_NODE(&bfqq->entity.rb_node);
+ INIT_LIST_HEAD(&bfqq->fifo);
+ INIT_HLIST_NODE(&bfqq->burst_list_node);
+
+ bfqq->ref = 0;
+ /* must be set before the next call, which bails out on a NULL bfqd */
+ bfqq->bfqd = bfqd;
+
+ if (bic)
+ bfq_set_next_ioprio_data(bfqq, bic);
+
+ if (is_sync) {
+ if (!bfq_class_idle(bfqq))
+ bfq_mark_bfqq_idle_window(bfqq);
+ bfq_mark_bfqq_sync(bfqq);
+ bfq_mark_bfqq_just_created(bfqq);
+ } else
+ bfq_clear_bfqq_sync(bfqq);
+
+ /* set end request to minus infinity from now */
+ bfqq->ttime.last_end_request = ktime_get_ns() + 1;
+
+ bfq_mark_bfqq_IO_bound(bfqq);
+
+ bfqq->pid = pid;
+
+ /* Tentative initial value to trade off between thr and lat */
+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
+ bfqq->budget_timeout = bfq_smallest_from_now();
+
+ /* a coefficient equal to 1 means no weight raising */
+ bfqq->wr_coeff = 1;
+ bfqq->last_wr_start_finish = jiffies;
+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
+ bfqq->split_time = bfq_smallest_from_now();
+
+ /*
+ * Set to the value for which bfqq will not be deemed as
+ * soft rt when it becomes backlogged.
+ */
+ bfqq->soft_rt_next_start = bfq_greatest_from_now();
+
+ /* first request is almost certainly seeky */
+ bfqq->seek_history = 1;
+}
+
+/*
+ * Return the address of the slot of bfqg that holds the async queue
+ * for the given priority class and level, or NULL if the class is
+ * not one of the known ones.
+ *
+ * RT queues live in async_bfqq[0][], BE queues in async_bfqq[1][];
+ * requests with no class use the BE slot of level IOPRIO_NORM, and
+ * the idle class has a single slot of its own.
+ */
+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
+					       struct bfq_group *bfqg,
+					       int ioprio_class, int ioprio)
+{
+	if (ioprio_class == IOPRIO_CLASS_RT)
+		return &bfqg->async_bfqq[0][ioprio];
+
+	/* no class set: same as best-effort with the default level */
+	if (ioprio_class == IOPRIO_CLASS_NONE)
+		return &bfqg->async_bfqq[1][IOPRIO_NORM];
+
+	if (ioprio_class == IOPRIO_CLASS_BE)
+		return &bfqg->async_bfqq[1][ioprio];
+
+	if (ioprio_class == IOPRIO_CLASS_IDLE)
+		return &bfqg->async_idle_bfqq;
+
+	return NULL;
+}
+
+/*
+ * Return a queue for the given bic and direction, taking a process
+ * reference on it. The returned pointer is never NULL.
+ *
+ * For async I/O, the queue shared by the group for the priority class
+ * and level of bic is returned, if it already exists. Otherwise a new
+ * queue is allocated and initialized; a newly allocated async queue is
+ * also stored in the slot of the group, with an extra reference.
+ *
+ * If the group cannot be found, or the allocation fails, the
+ * statically available bfqd->oom_bfqq is returned instead.
+ *
+ * The group lookup and everything that follows run inside an RCU
+ * read-side critical section.
+ */
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+ struct bio *bio, bool is_sync,
+ struct bfq_io_cq *bic)
+{
+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+ struct bfq_queue **async_bfqq = NULL;
+ struct bfq_queue *bfqq;
+ struct bfq_group *bfqg;
+
+ rcu_read_lock();
+
+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+ if (!bfqg) {
+ bfqq = &bfqd->oom_bfqq;
+ goto out;
+ }
+
+ if (!is_sync) {
+ /*
+ * NOTE(review): bfq_async_queue_prio() returns NULL for an
+ * unknown class, and the result is dereferenced unchecked.
+ * Presumably bic->ioprio always carries a valid class here
+ * -- confirm against the callers.
+ */
+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
+ ioprio);
+ bfqq = *async_bfqq;
+ if (bfqq)
+ goto out;
+ }
+
+ /* non-blocking, zeroed allocation; failure is handled below */
+ bfqq = kmem_cache_alloc_node(bfq_pool,
+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
+ bfqd->queue->node);
+
+ if (bfqq) {
+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+ is_sync);
+ bfq_init_entity(&bfqq->entity, bfqg);
+ bfq_log_bfqq(bfqd, bfqq, "allocated");
+ } else {
+ bfqq = &bfqd->oom_bfqq;
+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
+ goto out;
+ }
+
+ /*
+ * Pin the queue now that it's allocated, scheduler exit will
+ * prune it.
+ */
+ if (async_bfqq) {
+ bfqq->ref++; /*
+ * Extra group reference, w.r.t. sync
+ * queue. This extra reference is removed
+ * only if bfqq->bfqg disappears, to
+ * guarantee that this queue is not freed
+ * until its group goes away.
+ */
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
+ bfqq, bfqq->ref);
+ *async_bfqq = bfqq;
+ }
+
+out:
+ bfqq->ref++; /* get a process reference to this queue */
+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+ rcu_read_unlock();
+ return bfqq;
+}
+
+/*
+ * Feed a new think-time sample into the statistics of bfqq.
+ *
+ * The sample is the time elapsed since last_end_request, capped at
+ * twice bfq_slice_idle. Samples count and total are updated as
+ * weighted averages (old value weighing 7/8), with a fixed-point
+ * scale of 256; the mean is then recomputed from them.
+ */
+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
+				    struct bfq_queue *bfqq)
+{
+	struct bfq_ttime *tt = &bfqq->ttime;
+	u64 sample = ktime_get_ns() - tt->last_end_request;
+
+	sample = min_t(u64, sample, 2ULL * bfqd->bfq_slice_idle);
+
+	tt->ttime_samples = (7*tt->ttime_samples + 256) / 8;
+	tt->ttime_total = div_u64(7*tt->ttime_total + 256*sample, 8);
+	tt->ttime_mean = div64_ul(tt->ttime_total + 128, tt->ttime_samples);
+}
+
+/*
+ * Shift a new bit into the seek history of bfqq for request rq.
+ *
+ * The bit is set if rq lies farther than BFQQ_SEEK_THR from the end
+ * of the previous request and, on a non-rotational device only, rq
+ * is also smaller than BFQQ_SECT_THR_NONROT sectors.
+ */
+static void
+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		       struct request *rq)
+{
+	bool seeky;
+
+	bfqq->seek_history <<= 1;
+
+	seeky = get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR;
+	if (seeky && blk_queue_nonrot(bfqd->queue))
+		seeky = blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT;
+
+	bfqq->seek_history |= seeky;
+}
+
+/*
+ * Re-evaluate the idle_window flag of bfqq: disable the idle window
+ * if the process thinks too long or seeks so much that it doesn't
+ * matter, enable it if valid think-time statistics say otherwise.
+ *
+ * The flag is left untouched for async queues, for queues in the idle
+ * io prio class, and for queues split too recently for their
+ * statistics to be meaningful.
+ */
+static void bfq_update_idle_window(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq,
+				   struct bfq_io_cq *bic)
+{
+	int enable_idle;
+
+	/* Don't idle for async queues ... */
+	if (!bfq_bfqq_sync(bfqq))
+		return;
+
+	/* ... nor for the idle io prio class. */
+	if (bfq_class_idle(bfqq))
+		return;
+
+	/* Idle window just restored, statistics are meaningless. */
+	if (time_is_after_eq_jiffies(bfqq->split_time +
+				     bfqd->bfq_wr_min_idle_time))
+		return;
+
+	/* Start from the current setting: it is kept if no rule applies. */
+	enable_idle = bfq_bfqq_idle_window(bfqq);
+
+	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
+	    bfqd->bfq_slice_idle == 0 ||
+	    (bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
+	     bfqq->wr_coeff == 1)) {
+		enable_idle = 0;
+	} else if (bfq_sample_valid(bfqq->ttime.ttime_samples)) {
+		/*
+		 * A mean think time above the idle slice disables
+		 * idling, unless the queue is being weight-raised.
+		 */
+		enable_idle = !(bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+				bfqq->wr_coeff == 1);
+	}
+
+	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
+		     enable_idle);
+
+	if (enable_idle)
+		bfq_mark_bfqq_idle_window(bfqq);
+	else
+		bfq_clear_bfqq_idle_window(bfqq);
+}
+
+/*
+ * Called when a new fs request (rq) is added to bfqq. Check if there's
+ * something we should do about it.
+ *
+ * Think-time and seek statistics of bfqq are updated, and the idle
+ * window is possibly re-evaluated. Then, if bfqq is the in-service
+ * queue and the device is being idled waiting for it, idling is
+ * stopped, unless rq is a lone small request and the budget has not
+ * timed out; on budget timeout the queue is also expired.
+ */
+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct request *rq)
+{
+ struct bfq_io_cq *bic = RQ_BIC(rq);
+
+ if (rq->cmd_flags & REQ_META)
+ bfqq->meta_pending++;
+
+ bfq_update_io_thinktime(bfqd, bfqq);
+ bfq_update_io_seektime(bfqd, bfqq, rq);
+ /*
+ * Re-evaluate the idle window only if the queue has received more
+ * than 1/8 of the maximum budget as service, or is not seeky.
+ */
+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+ !BFQQ_SEEKY(bfqq))
+ bfq_update_idle_window(bfqd, bfqq, bic);
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "rq_enqueued: idle_window=%d (seeky %d)",
+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
+
+ /* remember where this request ends, for the next seek sample */
+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
+ /* small: the only request queued, and shorter than 32 sectors */
+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+ blk_rq_sectors(rq) < 32;
+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+
+ /*
+ * There is just this request queued: if the request
+ * is small and the queue is not to be expired, then
+ * just exit.
+ *
+ * In this way, if the device is being idled to wait
+ * for a new request from the in-service queue, we
+ * avoid unplugging the device and committing the
+ * device to serve just a small request. On the
+ * contrary, we wait for the block layer to decide
+ * when to unplug the device: hopefully, new requests
+ * will be merged to this one quickly, then the device
+ * will be unplugged and larger requests will be
+ * dispatched.
+ */
+ if (small_req && !budget_timeout)
+ return;
+
+ /*
+ * A large enough request arrived, or the queue is to
+ * be expired: in both cases disk idling is to be
+ * stopped, so clear wait_request flag and reset
+ * timer.
+ */
+ bfq_clear_bfqq_wait_request(bfqq);
+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+ bfqg_stats_update_idle_time(bfqq_group(bfqq));
+
+ /*
+ * The queue is not empty, because a new request just
+ * arrived. Hence we can safely expire the queue, in
+ * case of budget timeout, without risking that the
+ * timestamps of the queue are not updated correctly.
+ * See [1] for more details.
+ */
+ if (budget_timeout)
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQQE_BUDGET_TIMEOUT);
+ }
+}
+
+/*
+ * Insert rq into the queue it belongs to.
+ *
+ * If bfq_setup_cooperator() returns a queue to merge with, rq is first
+ * redirected to that queue (or to the queue the bic currently points
+ * to, if different from the original one), moving the allocation
+ * count and the request reference accordingly. Then rq is added to
+ * the queue, gets its fifo expiration time, is appended to the fifo
+ * list, and bfq_rq_enqueued() is invoked.
+ */
+static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
+{
+ struct bfq_queue *bfqq = RQ_BFQQ(rq),
+ *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+
+ if (new_bfqq) {
+ /* the bic may have been redirected already: follow it */
+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
+ /*
+ * Release the request's reference to the old bfqq
+ * and make sure one is taken to the shared queue.
+ */
+ new_bfqq->allocated++;
+ bfqq->allocated--;
+ new_bfqq->ref++;
+ bfq_clear_bfqq_just_created(bfqq);
+ /*
+ * If the bic associated with the process
+ * issuing this request still points to bfqq
+ * (and thus has not been already redirected
+ * to new_bfqq or even some other bfq_queue),
+ * then complete the merge and redirect it to
+ * new_bfqq.
+ */
+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
+ bfqq, new_bfqq);
+ /*
+ * rq is about to be enqueued into new_bfqq,
+ * release rq reference on bfqq
+ */
+ bfq_put_queue(bfqq);
+ rq->elv.priv[1] = new_bfqq;
+ bfqq = new_bfqq;
+ }
+
+ bfq_add_request(rq);
+
+ /* fifo deadline depends on whether rq is sync or async */
+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+ list_add_tail(&rq->queuelist, &bfqq->fifo);
+
+ bfq_rq_enqueued(bfqd, bfqq, rq);
+}
+
+/*
+ * Insert rq into the scheduler.
+ *
+ * A merge of rq into an already queued request is attempted first; if
+ * it succeeds there is nothing else to do. Otherwise rq goes to the
+ * head of the bfqd->dispatch list if at_head is set, to its tail if
+ * rq is a passthrough request, and into its bfq_queue in all other
+ * cases (in which case it is also made available for future merges).
+ *
+ * bfqd->lock is released around blk_mq_sched_request_inserted().
+ */
+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			       bool at_head)
+{
+	struct request_queue *q = hctx->queue;
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	bool merged;
+
+	spin_lock_irq(&bfqd->lock);
+	merged = blk_mq_sched_try_insert_merge(q, rq);
+	spin_unlock_irq(&bfqd->lock);
+
+	if (merged)
+		return;
+
+	blk_mq_sched_request_inserted(rq);
+
+	spin_lock_irq(&bfqd->lock);
+
+	if (at_head) {
+		list_add(&rq->queuelist, &bfqd->dispatch);
+	} else if (blk_rq_is_passthrough(rq)) {
+		list_add_tail(&rq->queuelist, &bfqd->dispatch);
+	} else {
+		__bfq_insert_request(bfqd, rq);
+
+		if (rq_mergeable(rq)) {
+			elv_rqhash_add(q, rq);
+			if (!q->last_merge)
+				q->last_merge = rq;
+		}
+	}
+
+	spin_unlock_irq(&bfqd->lock);
+}
+
+/*
+ * Drain list, inserting its requests one at a time, in list order,
+ * through bfq_insert_request(). The list is empty on return.
+ */
+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
+				struct list_head *list, bool at_head)
+{
+	struct request *rq;
+
+	for (;;) {
+		if (list_empty(list))
+			break;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		bfq_insert_request(hctx, rq, at_head);
+	}
+}
+
+/*
+ * Update the estimate of whether the device queues requests
+ * internally (bfqd->hw_tag).
+ *
+ * The maximum number of requests seen in the driver is tracked on
+ * every call. Once hw_tag is 1 nothing else is done. Otherwise, after
+ * more than BFQ_HW_QUEUE_SAMPLES valid samples, hw_tag is set
+ * according to whether that maximum exceeded BFQ_HW_QUEUE_THRESHOLD,
+ * and the sampling is restarted.
+ */
+static void bfq_update_hw_tag(struct bfq_data *bfqd)
+{
+	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
+				       bfqd->rq_in_driver);
+
+	/*
+	 * Bail out if: the device is already known to queue; or this
+	 * sample is not valid, i.e., the number of outstanding
+	 * requests is not large enough to allow a queueing behavior
+	 * (note that the sum is not exact, as it's not taking into
+	 * account deactivated requests); or not enough samples have
+	 * been collected yet. The sample counter is incremented only
+	 * when the first two tests do not apply.
+	 */
+	if (bfqd->hw_tag == 1 ||
+	    bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD ||
+	    bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
+		return;
+
+	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
+	bfqd->max_rq_in_driver = 0;
+	bfqd->hw_tag_samples = 0;
+}
+
+/*
+ * Account for the completion of a request of bfqq.
+ *
+ * Updates the hw_tag estimate, the counters of requests in the driver
+ * and dispatched from bfqq, the think-time reference of bfqq and the
+ * time of the last completion; possibly resets the peak-rate
+ * observation interval and computes soft_rt_next_start. Finally, if
+ * bfqq is the in-service queue, decides whether to arm the idle timer
+ * or to expire the queue.
+ */
+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
+{
+ u64 now_ns;
+ u32 delta_us;
+
+ bfq_update_hw_tag(bfqd);
+
+ bfqd->rq_in_driver--;
+ bfqq->dispatched--;
+
+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+ /*
+ * Set budget_timeout (which we overload to store the
+ * time at which the queue remains with no backlog and
+ * no outstanding request; used by the weight-raising
+ * mechanism).
+ */
+ bfqq->budget_timeout = jiffies;
+
+ bfq_weights_tree_remove(bfqd, &bfqq->entity,
+ &bfqd->queue_weights_tree);
+ }
+
+ now_ns = ktime_get_ns();
+
+ bfqq->ttime.last_end_request = now_ns;
+
+ /*
+ * Using us instead of ns, to get a reasonable precision in
+ * computing rate in next check.
+ */
+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
+
+ /*
+ * If the request took rather long to complete, and, according
+ * to the maximum request size recorded, this completion latency
+ * implies that the request was certainly served at a very low
+ * rate (less than 1M sectors/sec), then the whole observation
+ * interval that lasts up to this time instant cannot be a
+ * valid time interval for computing a new peak rate. Invoke
+ * bfq_update_rate_reset to have the following three steps
+ * taken:
+ * - close the observation interval at the last (previous)
+ * request dispatch or completion
+ * - compute rate, if possible, for that observation interval
+ * - reset to zero samples, which will trigger a proper
+ * re-initialization of the observation interval on next
+ * dispatch
+ */
+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
+ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
+ 1UL<<(BFQ_RATE_SHIFT - 10))
+ bfq_update_rate_reset(bfqd, NULL);
+ bfqd->last_completion = now_ns;
+
+ /*
+ * If we are waiting to discover whether the request pattern
+ * of the task associated with the queue is actually
+ * isochronous, and both requisites for this condition to hold
+ * are now satisfied, then compute soft_rt_next_start (see the
+ * comments on the function bfq_bfqq_softrt_next_start()). We
+ * schedule this delayed check when bfqq expires, if it still
+ * has in-flight requests.
+ */
+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+ RB_EMPTY_ROOT(&bfqq->sort_list))
+ bfqq->soft_rt_next_start =
+ bfq_bfqq_softrt_next_start(bfqd, bfqq);
+
+ /*
+ * If this is the in-service queue, check if it needs to be expired,
+ * or if we want to idle in case it has no pending requests.
+ */
+ if (bfqd->in_service_queue == bfqq) {
+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
+ /* nothing in flight and idling required: start waiting */
+ bfq_arm_slice_timer(bfqd);
+ return;
+ } else if (bfq_may_expire_for_budg_timeout(bfqq))
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQQE_BUDGET_TIMEOUT);
+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ (bfqq->dispatched == 0 ||
+ !bfq_bfqq_may_idle(bfqq)))
+ bfq_bfqq_expire(bfqd, bfqq, false,
+ BFQQE_NO_MORE_REQUESTS);
+ }
+}
+
+/*
+ * Undo the per-request accounting on bfqq: one allocated request
+ * less, and one reference less. bfqq must not be used afterwards, as
+ * dropping the reference may free it.
+ */
+static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
+{
+	--bfqq->allocated;
+	bfq_put_queue(bfqq);
+}
+
+/*
+ * Release the scheduler-private data of rq.
+ *
+ * For a started request (RQF_STARTED set), group statistics are
+ * updated and then, under bfqd->lock, the completion is accounted and
+ * the request reference on its queue is dropped. For a request that
+ * was never started, the request is removed from the scheduler if
+ * still there, and the reference is dropped without taking the lock.
+ * In both cases the private pointers of rq are cleared.
+ */
+static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	if (likely(rq->rq_flags & RQF_STARTED)) {
+		unsigned long irq_flags;
+
+		/* statistics are updated before taking the lock */
+		bfqg_stats_update_completion(bfqq_group(bfqq),
+					     rq_start_time_ns(rq),
+					     rq_io_start_time_ns(rq),
+					     rq->cmd_flags);
+
+		spin_lock_irqsave(&bfqd->lock, irq_flags);
+
+		bfq_completed_request(bfqq, bfqd);
+		bfq_put_rq_priv_body(bfqq);
+
+		spin_unlock_irqrestore(&bfqd->lock, irq_flags);
+	} else {
+		/*
+		 * Request rq may be still/already in the scheduler,
+		 * in which case we need to remove it. And we cannot
+		 * defer such a check and removal, to avoid
+		 * inconsistencies in the time interval from the end
+		 * of this function to the start of the deferred work.
+		 * This situation seems to occur only in process
+		 * context, as a consequence of a merge. In the
+		 * current version of the code, this implies that the
+		 * lock is held.
+		 */
+		if (!RB_EMPTY_NODE(&rq->rb_node))
+			bfq_remove_request(q, rq);
+
+		bfq_put_rq_priv_body(bfqq);
+	}
+
+	rq->elv.priv[0] = NULL;
+	rq->elv.priv[1] = NULL;
+}
+
+/*
+ * Split the process owning bic from the shared queue bfqq.
+ *
+ * Returns bfqq itself if the current process is the last one
+ * referring to it: the queue is then simply handed over to the
+ * process, with its coop and split_coop flags cleared. Returns NULL,
+ * meaning that a new bfqq should be allocated, in all other cases:
+ * the bic is then detached from bfqq and the references held through
+ * it are dropped.
+ */
+static struct bfq_queue *
+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
+
+	if (bfqq_process_refs(bfqq) != 1) {
+		/* other processes still use bfqq: leave it to them */
+		bic_set_bfqq(bic, NULL, 1);
+
+		bfq_put_cooperator(bfqq);
+
+		bfq_put_queue(bfqq);
+		return NULL;
+	}
+
+	/* last user: the queue becomes private to the current process */
+	bfqq->pid = current->pid;
+	bfq_clear_bfqq_coop(bfqq);
+	bfq_clear_bfqq_split_coop(bfqq);
+
+	return bfqq;
+}
+
+/*
+ * Return the queue that bic uses for sync or async I/O (selected by
+ * is_sync). If bic has no such queue, or only has bfqd->oom_bfqq, a
+ * queue is obtained through bfq_get_queue() and installed in bic; in
+ * that case *new_queue is set to true when new_queue is not NULL.
+ *
+ * When the lookup follows the split of a sync queue (split && is_sync),
+ * the large-burst state remembered in bic is applied to the queue and
+ * its split_time is set to the current jiffies.
+ */
+static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
+						   struct bfq_io_cq *bic,
+						   struct bio *bio,
+						   bool split, bool is_sync,
+						   bool *new_queue)
+{
+	struct bfq_queue *queue = bic_to_bfqq(bic, is_sync);
+	bool in_large_burst;
+
+	if (likely(queue && queue != &bfqd->oom_bfqq))
+		return queue;
+
+	if (new_queue)
+		*new_queue = true;
+
+	if (queue)
+		bfq_put_queue(queue);
+	queue = bfq_get_queue(bfqd, bio, is_sync, bic);
+	bic_set_bfqq(bic, queue, is_sync);
+
+	if (!split || !is_sync)
+		return queue;
+
+	in_large_burst = bic->saved_in_large_burst ||
+			 (bic->was_in_burst_list && bfqd->large_burst);
+	if (in_large_burst) {
+		bfq_mark_bfqq_in_large_burst(queue);
+	} else {
+		bfq_clear_bfqq_in_large_burst(queue);
+		if (bic->was_in_burst_list)
+			hlist_add_head(&queue->burst_list_node,
+				       &bfqd->burst_list);
+	}
+	queue->split_time = jiffies;
+
+	return queue;
+}
+
+/*
+ * Allocate bfq data structures associated with this request.
+ *
+ * The whole function body runs under bfqd->lock, which is taken with
+ * spin_lock_irq() and released on both exit paths. The bfq_queue for
+ * the (bic, sync/async) pair of rq is looked up or created; a queue
+ * flagged both coop and split_coop is split first. The queue's
+ * allocated and ref counters are then incremented on behalf of rq,
+ * and bic and the queue are recorded in rq->elv.priv[0] and
+ * rq->elv.priv[1].
+ *
+ * Returns 0 on success, 1 if rq has no bfq_io_cq (bic is NULL).
+ */
+static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
+ const int is_sync = rq_is_sync(rq);
+ struct bfq_queue *bfqq;
+ bool new_queue = false;
+ bool split = false;
+
+ spin_lock_irq(&bfqd->lock);
+
+ if (!bic)
+ goto queue_fail;
+
+ bfq_check_ioprio_change(bic, bio);
+
+ bfq_bic_update_cgroup(bic, bio);
+
+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync,
+ &new_queue);
+
+ if (likely(!new_queue)) {
+ /* If the queue was seeky for too long, break it apart. */
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+
+ /* Update bic before losing reference to bfqq */
+ if (bfq_bfqq_in_large_burst(bfqq))
+ bic->saved_in_large_burst = true;
+
+ bfqq = bfq_split_bfqq(bic, bfqq);
+ split = true;
+
+ if (!bfqq) /* bic was detached from the shared queue: get a new one */
+ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
+ true, is_sync,
+ NULL);
+ }
+ }
+
+ bfqq->allocated++; /* paired with bfqq->allocated-- in bfq_put_rq_priv_body() */
+ bfqq->ref++; /* paired with bfq_put_queue() in bfq_put_rq_priv_body() */
+ bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
+ rq, bfqq, bfqq->ref);
+
+ rq->elv.priv[0] = bic;
+ rq->elv.priv[1] = bfqq;
+
+ /*
+ * If a bfq_queue has only one process reference, it is owned
+ * by only this bic: we can then set bfqq->bic = bic. In
+ * addition, if the queue has also just been split, we have to
+ * resume its state.
+ */
+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+ bfqq->bic = bic;
+ if (split) {
+ /*
+ * The queue has just been split from a shared
+ * queue: restore the idle window and the
+ * possible weight raising period.
+ */
+ bfq_bfqq_resume_state(bfqq, bic);
+ }
+ }
+
+ if (unlikely(bfq_bfqq_just_created(bfqq)))
+ bfq_handle_burst(bfqd, bfqq);
+
+ spin_unlock_irq(&bfqd->lock);
+
+ return 0;
+
+queue_fail:
+ spin_unlock_irq(&bfqd->lock);
+
+ return 1;
+}
+
+/*
+ * Body of the idle-slice timer handler.
+ *
+ * @bfqd: device whose idle_slice_timer fired.
+ * @bfqq: value of bfqd->in_service_queue sampled by the handler
+ *        WITHOUT holding bfqd->lock.
+ *
+ * Because bfqq was sampled locklessly, it may no longer be the
+ * in-service queue (and may no longer be a valid queue at all) by the
+ * time the lock is acquired. For this reason bfqd is passed in
+ * explicitly instead of being read through bfqq, and bfqq is only
+ * compared by address until it has been confirmed, under the lock, to
+ * still be the in-service queue.
+ */
+static void
+bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	enum bfqq_expiration reason;
+	unsigned long flags;
+
+	spin_lock_irqsave(&bfqd->lock, flags);
+
+	/*
+	 * Do not touch bfqq before this check: if it lost the service
+	 * slot in the meantime, there is nothing to do for it here.
+	 * NOTE(review): this assumes the expiration path already
+	 * clears the wait_request flag of a queue leaving service --
+	 * confirm against the in-service reset code.
+	 */
+	if (bfqq != bfqd->in_service_queue) {
+		spin_unlock_irqrestore(&bfqd->lock, flags);
+		return;
+	}
+
+	bfq_clear_bfqq_wait_request(bfqq);
+
+	if (bfq_bfqq_budget_timeout(bfqq))
+		/*
+		 * Also here the queue can be safely expired
+		 * for budget timeout without wasting
+		 * guarantees
+		 */
+		reason = BFQQE_BUDGET_TIMEOUT;
+	else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
+		/*
+		 * The queue may not be empty upon timer expiration,
+		 * because we may not disable the timer when the
+		 * first request of the in-service queue arrives
+		 * during disk idling.
+		 */
+		reason = BFQQE_TOO_IDLE;
+	else
+		goto schedule_dispatch;
+
+	bfq_bfqq_expire(bfqd, bfqq, true, reason);
+
+schedule_dispatch:
+	spin_unlock_irqrestore(&bfqd->lock, flags);
+	bfq_schedule_dispatch(bfqd);
+}
+
+/*
+ * Handler of the expiration of the timer running if the in-service queue
+ * is idling inside its time slice. Always returns HRTIMER_NORESTART.
+ */
+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
+{
+	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
+					     idle_slice_timer);
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+
+	/*
+	 * Theoretical race here: the in-service queue can be NULL or
+	 * different from the queue that was idling if a new request
+	 * arrives for the current queue and there is a full dispatch
+	 * cycle that changes the in-service queue. The pointer read
+	 * above is therefore only a hint; it is re-validated under
+	 * bfqd->lock by bfq_idle_slice_timer_body() before being
+	 * dereferenced.
+	 */
+	if (bfqq)
+		bfq_idle_slice_timer_body(bfqd, bfqq);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Release the async queue stored in *bfqq_ptr, if any: the queue is
+ * first moved to the root group of bfqd, then put, and the slot is
+ * reset to NULL. The slot content is logged even when it is NULL.
+ */
+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+				 struct bfq_queue **bfqq_ptr)
+{
+	struct bfq_queue *async_q = *bfqq_ptr;
+
+	bfq_log(bfqd, "put_async_bfqq: %p", async_q);
+	if (!async_q)
+		return;
+
+	bfq_bfqq_move(bfqd, async_q, bfqd->root_group);
+
+	bfq_log_bfqq(bfqd, async_q, "put_async_bfqq: putting %p, %d",
+		     async_q, async_q->ref);
+	bfq_put_queue(async_q);
+	*bfqq_ptr = NULL;
+}
+
+/*
+ * Drop every reference that bfqg holds on its async queues: the
+ * 2 x IOPRIO_BE_NR entries of async_bfqq[][] and async_idle_bfqq.
+ * If the group is being deallocated these queues may still contain
+ * requests, so each of them is reparented to the root cgroup (i.e.,
+ * the only one that will exist for sure until all the requests on a
+ * device are gone) before being put.
+ */
+void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+{
+	int row, col;
+
+	for (row = 0; row < 2; row++) {
+		for (col = 0; col < IOPRIO_BE_NR; col++)
+			__bfq_put_async_bfqq(bfqd,
+					     &bfqg->async_bfqq[row][col]);
+	}
+
+	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+}
+/*
+ * exit_sched hook (see iosched_bfq_mq): tear down the per-device
+ * scheduler data stored in e->elevator_data.
+ *
+ * The idle-slice timer is cancelled, every queue on bfqd->idle_list
+ * is deactivated under bfqd->lock and the timer is cancelled again.
+ * Then, with CONFIG_BFQ_GROUP_IOSCHED, the bfq blkcg policy is
+ * deactivated for the request queue; without it, the async queues of
+ * the root group are put and the root group is freed. Finally bfqd
+ * itself is freed.
+ */
+static void bfq_exit_queue(struct elevator_queue *e)
+{
+ struct bfq_data *bfqd = e->elevator_data;
+ struct bfq_queue *bfqq, *n;
+
+ hrtimer_cancel(&bfqd->idle_slice_timer);
+
+ spin_lock_irq(&bfqd->lock);
+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+ spin_unlock_irq(&bfqd->lock);
+
+ hrtimer_cancel(&bfqd->idle_slice_timer); /* NOTE(review): second cancel; presumably covers a re-arm during the deactivations above -- confirm */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
+#else
+ spin_lock_irq(&bfqd->lock);
+ bfq_put_async_queues(bfqd, bfqd->root_group);
+ kfree(bfqd->root_group);
+ spin_unlock_irq(&bfqd->lock);
+#endif
+
+ kfree(bfqd);
+}
+
+/*
+ * Initialize the root group of bfqd: empty request-position tree,
+ * one freshly initialized service tree per ioprio class, and the
+ * idle-class "last service" time set to the current jiffies. With
+ * CONFIG_BFQ_GROUP_IOSCHED the group is also marked as having no
+ * parent and no entity of its own, and is linked back to bfqd.
+ */
+static void bfq_init_root_group(struct bfq_group *root_group,
+				struct bfq_data *bfqd)
+{
+	struct bfq_service_tree *trees = root_group->sched_data.service_tree;
+	int class_idx;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	root_group->entity.parent = NULL;
+	root_group->my_entity = NULL;
+	root_group->bfqd = bfqd;
+#endif
+	root_group->rq_pos_tree = RB_ROOT;
+
+	for (class_idx = 0; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) {
+		trees[class_idx] = BFQ_SERVICE_TREE_INIT;
+	}
+
+	root_group->sched_data.bfq_class_idle_last_service = jiffies;
+}
+/*
+ * init_sched hook (see iosched_bfq_mq): allocate and initialize the
+ * per-device bfq_data for request queue q and install a new
+ * elevator_queue for elevator type e in q->elevator.
+ *
+ * All tunables are set to their defaults and the fallback oom_bfqq is
+ * prepared before the root group is created; the reason for this
+ * ordering is explained in the comment preceding the call to
+ * bfq_create_group_hierarchy() below.
+ *
+ * Returns 0 on success, -ENOMEM if any allocation (elevator queue,
+ * bfq_data or root group) fails.
+ */
+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+ struct bfq_data *bfqd;
+ struct elevator_queue *eq;
+
+ eq = elevator_alloc(q, e);
+ if (!eq)
+ return -ENOMEM;
+
+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
+ if (!bfqd) {
+ kobject_put(&eq->kobj); /* drop our reference on eq */
+ return -ENOMEM;
+ }
+ eq->elevator_data = bfqd;
+
+ spin_lock_irq(q->queue_lock);
+ q->elevator = eq;
+ spin_unlock_irq(q->queue_lock);
+
+ /*
+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+ * Grab a permanent reference to it, so that the normal code flow
+ * will not attempt to free it.
+ */
+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+ bfqd->oom_bfqq.ref++;
+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+ bfqd->oom_bfqq.entity.new_weight =
+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+ /* oom_bfqq does not participate to bursts */
+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+
+ /*
+ * Trigger weight initialization, according to ioprio, at the
+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+ * class won't be changed any more.
+ */
+ bfqd->oom_bfqq.entity.prio_changed = 1;
+
+ bfqd->queue = q;
+
+ INIT_LIST_HEAD(&bfqd->dispatch);
+
+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+
+ bfqd->queue_weights_tree = RB_ROOT;
+ bfqd->group_weights_tree = RB_ROOT;
+
+ INIT_LIST_HEAD(&bfqd->active_list);
+ INIT_LIST_HEAD(&bfqd->idle_list);
+ INIT_HLIST_HEAD(&bfqd->burst_list);
+
+ bfqd->hw_tag = -1;
+
+ bfqd->bfq_max_budget = bfq_default_max_budget;
+
+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
+ bfqd->bfq_back_max = bfq_back_max;
+ bfqd->bfq_back_penalty = bfq_back_penalty;
+ bfqd->bfq_slice_idle = bfq_slice_idle;
+ bfqd->bfq_timeout = bfq_timeout;
+
+ bfqd->bfq_requests_within_timer = 120;
+
+ bfqd->bfq_large_burst_thresh = 8;
+ bfqd->bfq_burst_interval = msecs_to_jiffies(180);
+
+ bfqd->low_latency = true;
+
+ /*
+ * Trade-off between responsiveness and fairness.
+ */
+ bfqd->bfq_wr_coeff = 30;
+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
+ bfqd->bfq_wr_max_time = 0;
+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+ bfqd->bfq_wr_max_softrt_rate = 7000; /*
+ * Approximate rate required
+ * to playback or record a
+ * high-definition compressed
+ * video.
+ */
+ bfqd->wr_busy_queues = 0;
+
+ /*
+ * Begin by assuming, optimistically, that the device is a
+ * high-speed one, and that its peak rate is equal to 2/3 of
+ * the highest reference rate.
+ */
+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+ T_fast[blk_queue_nonrot(bfqd->queue)];
+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+ bfqd->device_speed = BFQ_BFQD_FAST;
+
+ spin_lock_init(&bfqd->lock);
+
+ /*
+ * The invocation of the next bfq_create_group_hierarchy
+ * function is the head of a chain of function calls
+ * (bfq_create_group_hierarchy->blkcg_activate_policy->
+ * blk_mq_freeze_queue) that may lead to the invocation of the
+ * has_work hook function. For this reason,
+ * bfq_create_group_hierarchy is invoked only after all
+ * scheduler data has been initialized, apart from the fields
+ * that can be initialized only after invoking
+ * bfq_create_group_hierarchy. This, in particular, enables
+ * has_work to correctly return false. Of course, to avoid
+ * other inconsistencies, the blk-mq stack must then refrain
+ * from invoking further scheduler hooks before this init
+ * function is finished.
+ */
+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
+ if (!bfqd->root_group)
+ goto out_free;
+ bfq_init_root_group(bfqd->root_group, bfqd);
+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+
+
+ return 0;
+
+out_free: /* NOTE(review): q->elevator still points to eq here; presumably the caller resets it on failure -- confirm */
+ kfree(bfqd);
+ kobject_put(&eq->kobj);
+ return -ENOMEM;
+}
+
+/* Destroy bfq_pool, the slab cache set up by bfq_slab_setup(). */
+static void bfq_slab_kill(void)
+{
+	struct kmem_cache *pool = bfq_pool;
+
+	kmem_cache_destroy(pool);
+}
+
+/*
+ * Create bfq_pool, the slab cache for struct bfq_queue.
+ * Returns 0 on success, -ENOMEM if the cache cannot be created.
+ */
+static int __init bfq_slab_setup(void)
+{
+	bfq_pool = KMEM_CACHE(bfq_queue, 0);
+
+	return bfq_pool ? 0 : -ENOMEM;
+}
+
+/*
+ * Print var into page as an unsigned decimal followed by a newline.
+ * Returns the value returned by sprintf().
+ */
+static ssize_t bfq_var_show(unsigned int var, char *page)
+{
+	const int written = sprintf(page, "%u\n", var);
+
+	return written;
+}
+
+/*
+ * Parse the base-10 unsigned number in page and store it in *var.
+ *
+ * Returns count on success. If kstrtoul() rejects the input, its
+ * negative error code is returned and *var is left untouched, so
+ * callers must check for a negative return value before using *var.
+ */
+static ssize_t bfq_var_store(unsigned long *var, const char *page,
+			     size_t count)
+{
+	unsigned long new_val;
+	int ret = kstrtoul(page, 10, &new_val);
+
+	if (ret)
+		return ret;
+
+	*var = new_val;
+	return count;
+}
+
+/*
+ * Generate a sysfs show handler printing __VAR. __CONV selects the
+ * conversion applied before printing: 0 none, 1 jiffies to
+ * milliseconds, 2 nanoseconds to milliseconds.
+ */
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	u64 __data = __VAR;						\
+	if (__CONV == 1)						\
+		__data = jiffies_to_msecs(__data);			\
+	else if (__CONV == 2)						\
+		__data = div_u64(__data, NSEC_PER_MSEC);		\
+	return bfq_var_show(__data, (page));				\
+}
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
+#undef SHOW_FUNCTION
+
+/* Same as SHOW_FUNCTION, but prints a nanosecond value in microseconds. */
+#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	u64 __data = __VAR;						\
+	__data = div_u64(__data, NSEC_PER_USEC);			\
+	return bfq_var_show(__data, (page));				\
+}
+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
+#undef USEC_SHOW_FUNCTION
+
+/*
+ * Generate a sysfs store handler. The parsed value is clamped to
+ * [MIN, MAX] and written to *__PTR after the conversion selected by
+ * __CONV: 0 none, 1 milliseconds to jiffies, 2 milliseconds to
+ * nanoseconds. If the input cannot be parsed, the error is returned
+ * and *__PTR is not modified.
+ */
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+static ssize_t								\
+__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned long __data = 0, __min = (MIN), __max = (MAX);		\
+	ssize_t ret = bfq_var_store(&__data, (page), count);		\
+									\
+	if (ret < 0)							\
+		return ret;						\
+	if (__data < __min)						\
+		__data = __min;						\
+	else if (__data > __max)					\
+		__data = __max;						\
+	if (__CONV == 1)						\
+		*(__PTR) = msecs_to_jiffies(__data);			\
+	else if (__CONV == 2)						\
+		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
+	else								\
+		*(__PTR) = __data;					\
+	return ret;							\
+}
+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
+		INT_MAX, 2);
+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
+		INT_MAX, 2);
+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
+		INT_MAX, 0);
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
+#undef STORE_FUNCTION
+
+/*
+ * Same as STORE_FUNCTION, but the user value is in microseconds and
+ * is stored in nanoseconds.
+ */
+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned long __data = 0, __min = (MIN), __max = (MAX);		\
+	ssize_t ret = bfq_var_store(&__data, (page), count);		\
+									\
+	if (ret < 0)							\
+		return ret;						\
+	if (__data < __min)						\
+		__data = __min;						\
+	else if (__data > __max)					\
+		__data = __max;						\
+	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
+	return ret;							\
+}
+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
+		    UINT_MAX);
+#undef USEC_STORE_FUNCTION
+
+/*
+ * Store handler for max_budget. Writing 0 makes bfq_max_budget be
+ * computed by bfq_calc_max_budget(); any other value (capped at
+ * INT_MAX) is used as is. The user value is remembered in
+ * bfq_user_max_budget. Unparsable input is rejected with the error
+ * returned by bfq_var_store() and changes nothing.
+ */
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
+				    const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long __data = 0;
+	ssize_t ret = bfq_var_store(&__data, (page), count);
+
+	if (ret < 0)
+		return ret;
+
+	if (__data == 0)
+		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+	else {
+		if (__data > INT_MAX)
+			__data = INT_MAX;
+		bfqd->bfq_max_budget = __data;
+	}
+
+	bfqd->bfq_user_max_budget = __data;
+
+	return ret;
+}
+
+/*
+ * Leaving this name to preserve name compatibility with cfq
+ * parameters, but this timeout is used for both sync and async.
+ *
+ * The value is given in milliseconds and clamped to [1, INT_MAX].
+ * If the user has not set a max budget (bfq_user_max_budget == 0),
+ * the max budget is recomputed for the new timeout. Unparsable input
+ * is rejected with the error returned by bfq_var_store() and changes
+ * nothing.
+ */
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+				      const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long __data = 0;
+	ssize_t ret = bfq_var_store(&__data, (page), count);
+
+	if (ret < 0)
+		return ret;
+
+	if (__data < 1)
+		__data = 1;
+	else if (__data > INT_MAX)
+		__data = INT_MAX;
+
+	bfqd->bfq_timeout = msecs_to_jiffies(__data);
+	if (bfqd->bfq_user_max_budget == 0)
+		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+
+	return ret;
+}
+
+/*
+ * Store handler for strict_guarantees; any value above 1 is treated
+ * as 1. When the flag goes from off to on, bfq_slice_idle is raised
+ * to at least 8 ms. Unparsable input is rejected with the error
+ * returned by bfq_var_store() and changes nothing.
+ */
+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
+					   const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long __data = 0;
+	ssize_t ret = bfq_var_store(&__data, (page), count);
+
+	if (ret < 0)
+		return ret;
+
+	if (__data > 1)
+		__data = 1;
+	if (!bfqd->strict_guarantees && __data == 1
+	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
+		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
+
+	bfqd->strict_guarantees = __data;
+
+	return ret;
+}
+
+/*
+ * Store handler for low_latency; any value above 1 is treated as 1.
+ * When the flag goes from on to off, bfq_end_wr() is called first.
+ * Unparsable input is rejected with the error returned by
+ * bfq_var_store() and changes nothing.
+ */
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+				     const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long __data = 0;
+	ssize_t ret = bfq_var_store(&__data, (page), count);
+
+	if (ret < 0)
+		return ret;
+
+	if (__data > 1)
+		__data = 1;
+	if (__data == 0 && bfqd->low_latency != 0)
+		bfq_end_wr(bfqd);
+	bfqd->low_latency = __data;
+
+	return ret;
+}
+/*
+ * BFQ_ATTR(name) expands to an __ATTR() entry with mode 0644 whose
+ * handlers are bfq_<name>_show and bfq_<name>_store. bfq_attrs is the
+ * __ATTR_NULL-terminated table of tunables referenced by
+ * .elevator_attrs of iosched_bfq_mq.
+ */
+#define BFQ_ATTR(name) \
+ __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store)
+
+static struct elv_fs_entry bfq_attrs[] = {
+ BFQ_ATTR(fifo_expire_sync),
+ BFQ_ATTR(fifo_expire_async),
+ BFQ_ATTR(back_seek_max),
+ BFQ_ATTR(back_seek_penalty),
+ BFQ_ATTR(slice_idle),
+ BFQ_ATTR(slice_idle_us),
+ BFQ_ATTR(max_budget),
+ BFQ_ATTR(timeout_sync),
+ BFQ_ATTR(strict_guarantees),
+ BFQ_ATTR(low_latency),
+ __ATTR_NULL
+};
+/*
+ * Elevator type descriptor for BFQ, registered by bfq_init() and
+ * unregistered by bfq_exit(). It fills in the blk-mq scheduler hooks
+ * (.ops.mq), sets .uses_mq, sizes the per-(queue, io_context) data as
+ * struct bfq_io_cq and exposes the bfq_attrs tunables under the
+ * elevator name "bfq".
+ */
+static struct elevator_type iosched_bfq_mq = {
+ .ops.mq = {
+ .get_rq_priv = bfq_get_rq_private,
+ .put_rq_priv = bfq_put_rq_private,
+ .exit_icq = bfq_exit_icq,
+ .insert_requests = bfq_insert_requests,
+ .dispatch_request = bfq_dispatch_request,
+ .next_request = elv_rb_latter_request,
+ .former_request = elv_rb_former_request,
+ .allow_merge = bfq_allow_bio_merge,
+ .bio_merge = bfq_bio_merge,
+ .request_merge = bfq_request_merge,
+ .requests_merged = bfq_requests_merged,
+ .request_merged = bfq_request_merged,
+ .has_work = bfq_has_work,
+ .init_sched = bfq_init_queue,
+ .exit_sched = bfq_exit_queue,
+ },
+
+ .uses_mq = true,
+ .icq_size = sizeof(struct bfq_io_cq),
+ .icq_align = __alignof__(struct bfq_io_cq),
+ .elevator_attrs = bfq_attrs,
+ .elevator_name = "bfq",
+ .elevator_owner = THIS_MODULE,
+};
+
+/*
+ * Module initialization: register the blkcg policy (only with
+ * CONFIG_BFQ_GROUP_IOSCHED), create the bfq_queue slab cache, compute
+ * the jiffies-based reference times and the speed-class thresholds,
+ * and finally register the elevator.
+ *
+ * Returns 0 on success or a negative error code. On failure every
+ * step already performed is undone, in reverse order.
+ */
+static int __init bfq_init(void)
+{
+	int ret;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	ret = blkcg_policy_register(&blkcg_policy_bfq);
+	if (ret)
+		return ret;
+#endif
+
+	ret = -ENOMEM;
+	if (bfq_slab_setup())
+		goto err_pol_unreg;
+
+	/*
+	 * Times to load large popular applications for the typical
+	 * systems installed on the reference devices (see the
+	 * comments before the definitions of the next two
+	 * arrays). Actually, we use slightly slower values, as the
+	 * estimated peak rate tends to be smaller than the actual
+	 * peak rate. The reason for this last fact is that estimates
+	 * are computed over much shorter time intervals than the long
+	 * intervals typically used for benchmarking. Why? First, to
+	 * adapt more quickly to variations. Second, because an I/O
+	 * scheduler cannot rely on a peak-rate-evaluation workload to
+	 * be run for a long time.
+	 */
+	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+	T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */
+	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
+
+	/*
+	 * Thresholds that determine the switch between speed classes
+	 * (see the comments before the definition of the array
+	 * device_speed_thresh). These thresholds are biased towards
+	 * transitions to the fast class. This is safer than the
+	 * opposite bias. In fact, a wrong transition to the slow
+	 * class results in short weight-raising periods, because the
+	 * speed of the device then tends to be higher than the
+	 * reference peak rate. On the opposite end, a wrong
+	 * transition to the fast class tends to increase
+	 * weight-raising periods, because of the opposite reason.
+	 */
+	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
+	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+
+	ret = elv_register(&iosched_bfq_mq);
+	if (ret)
+		goto slab_kill;
+
+	return 0;
+
+slab_kill:
+	/* The slab cache exists at this point: do not leak it. */
+	bfq_slab_kill();
+err_pol_unreg:
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+	return ret;
+}
+/*
+ * Module exit: unregister the elevator, unregister the blkcg policy
+ * (only with CONFIG_BFQ_GROUP_IOSCHED) and destroy the bfq_queue slab
+ * cache, in this order.
+ */
+static void __exit bfq_exit(void)
+{
+ elv_unregister(&iosched_bfq_mq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+ bfq_slab_kill();
+}
+
+module_init(bfq_init);
+module_exit(bfq_exit);
+
+MODULE_AUTHOR("Paolo Valente");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler");
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
new file mode 100644
index 0000000000000..ae783c06dfd9c
--- /dev/null
+++ b/block/bfq-iosched.h
@@ -0,0 +1,941 @@
+/*
+ * Header file for the BFQ I/O scheduler: data structures and
+ * prototypes of interface functions among BFQ components.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/blk-cgroup.h>
+
+#define BFQ_IOPRIO_CLASSES 3
+#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
+
+#define BFQ_MIN_WEIGHT 1
+#define BFQ_MAX_WEIGHT 1000
+#define BFQ_WEIGHT_CONVERSION_COEFF 10
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO 4
+
+#define BFQ_WEIGHT_LEGACY_DFL 100
+#define BFQ_DEFAULT_GRP_IOPRIO 0
+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
+
+/*
+ * Soft real-time applications are far more latency sensitive
+ * than interactive ones. Over-raise the weight of the former to
+ * favor them over the latter.
+ */
+#define BFQ_SOFTRT_WEIGHT_FACTOR 100
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree. All the fields are protected by the queue lock
+ * of the containing bfqd.
+ *
+ * In the field comments below, V presumably denotes the virtual time
+ * @vtime and F_i the finish timestamp of entity i (see the finish
+ * field of struct bfq_entity) -- TODO confirm.
+ */
+struct bfq_service_tree {
+ /* tree for active entities (i.e., those backlogged) */
+ struct rb_root active;
+ /* tree for idle entities (i.e., not backlogged, with V <= F_i) */
+ struct rb_root idle;
+
+ /* idle entity with minimum F_i */
+ struct bfq_entity *first_idle;
+ /* idle entity with maximum F_i */
+ struct bfq_entity *last_idle;
+
+ /* scheduler virtual time */
+ u64 vtime;
+ /* scheduler weight sum; active and idle entities contribute to it */
+ unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ *
+ * bfq_sched_data is the basic scheduler queue. It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+ * intermediate queue in a hierarchical setup. @next_in_service
+ * points to the active entity of the sched_data service trees that
+ * will be scheduled next. It is used to reduce the number of steps
+ * needed for each hierarchical-schedule update.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue, requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+ /* entity in service */
+ struct bfq_entity *in_service_entity;
+ /* head-of-line entity (see comments above) */
+ struct bfq_entity *next_in_service;
+ /* array of service trees, one per ioprio_class */
+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+ /* last time CLASS_IDLE was served (a jiffies value) */
+ unsigned long bfq_class_idle_last_service;
+
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ * with a given weight.
+ *
+ * One counter exists per distinct weight; counters are linked into a
+ * weights tree through @weights_node.
+ */
+struct bfq_weight_counter {
+ unsigned int weight; /* weight of the entities this counter refers to */
+ unsigned int num_active; /* nr of active entities with this weight */
+ /*
+ * Weights tree member (see bfq_data's @queue_weights_tree and
+ * @group_weights_tree)
+ */
+ struct rb_node weights_node;
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group in the upper level scheduler. Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy. Non-leaf entities also have their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace for now. Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @prio_changed flag. As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place, the effective and the requested priority
+ * values are synchronized.
+ *
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ. When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed. All the fields are protected by the queue lock of the
+ * containing bfqd.
+ */
+struct bfq_entity {
+ /* service_tree member */
+ struct rb_node rb_node;
+ /* pointer to the weight counter associated with this entity */
+ struct bfq_weight_counter *weight_counter;
+
+ /*
+ * Flag, true if the entity is on a tree (either the active or
+ * the idle one of its service_tree) or is in service.
+ */
+ bool on_st;
+
+ /* B-WF2Q+ start and finish timestamps [sectors/weight] */
+ u64 start, finish;
+
+ /* tree the entity is enqueued into; %NULL if not on a tree */
+ struct rb_root *tree;
+
+ /*
+ * minimum start time of the (active) subtree rooted at this
+ * entity; used for O(log N) lookups into active trees
+ */
+ u64 min_start;
+
+ /* amount of service received during the last service slot */
+ int service;
+
+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
+ int budget;
+
+ /* weight of the queue */
+ int weight;
+ /* next weight if a change is in progress */
+ int new_weight;
+
+ /* original weight, used to implement weight boosting */
+ int orig_weight;
+
+ /* parent entity, for hierarchical scheduling */
+ struct bfq_entity *parent;
+
+ /*
+ * For non-leaf nodes in the hierarchy, the associated
+ * scheduler queue, %NULL on leaf nodes.
+ */
+ struct bfq_sched_data *my_sched_data;
+ /* the scheduler queue this entity belongs to */
+ struct bfq_sched_data *sched_data;
+
+ /* flag, set to request a weight, ioprio or ioprio_class change */
+ int prio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_ttime - per process thinktime stats.
+ *
+ * NOTE(review): the time unit of the u64 fields is not visible in
+ * this header -- confirm against the code that updates them.
+ */
+struct bfq_ttime {
+ /* completion time of the last request */
+ u64 last_end_request;
+
+ /* total process thinktime */
+ u64 ttime_total;
+ /* number of thinktime samples */
+ unsigned long ttime_samples;
+ /* average process thinktime */
+ u64 ttime_mean;
+};
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with an
+ * io_context or more, if it is async or shared between cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * NOTE(review): no @cgroup member is declared in this struct, so the
+ * sentence above may be stale -- confirm.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_queue {
+ /* reference counter */
+ int ref;
+ /* parent bfq_data */
+ struct bfq_data *bfqd;
+
+ /* current ioprio and ioprio class */
+ unsigned short ioprio, ioprio_class;
+ /* next ioprio and ioprio class if a change is in progress */
+ unsigned short new_ioprio, new_ioprio_class;
+
+ /*
+ * Shared bfq_queue if queue is cooperating with one or more
+ * other queues.
+ */
+ struct bfq_queue *new_bfqq;
+ /* request-position tree member (see bfq_group's @rq_pos_tree) */
+ struct rb_node pos_node;
+ /* request-position tree root (see bfq_group's @rq_pos_tree) */
+ struct rb_root *pos_root;
+
+ /* sorted list of pending requests */
+ struct rb_root sort_list;
+ /* if fifo isn't expired, next request to serve */
+ struct request *next_rq;
+ /* number of sync and async requests queued */
+ int queued[2];
+ /* number of requests currently allocated */
+ int allocated;
+ /* number of pending metadata requests */
+ int meta_pending;
+ /* fifo list of requests in sort_list */
+ struct list_head fifo;
+
+ /* entity representing this queue in the scheduler */
+ struct bfq_entity entity;
+
+ /* maximum budget allowed from the feedback mechanism */
+ int max_budget;
+ /* budget expiration (in jiffies) */
+ unsigned long budget_timeout;
+
+ /* number of requests on the dispatch list or inside driver */
+ int dispatched;
+
+ /* status flags */
+ unsigned long flags;
+
+ /* node for active/idle bfqq list inside parent bfqd */
+ struct list_head bfqq_list;
+
+ /* associated @bfq_ttime struct */
+ struct bfq_ttime ttime;
+
+ /* bit vector: a 1 for each seeky request in history */
+ u32 seek_history;
+
+ /* node for the device's burst list */
+ struct hlist_node burst_list_node;
+
+ /* position of the last request enqueued */
+ sector_t last_request_pos;
+
+ /*
+ * Number of consecutive pairs of request completion and
+ * arrival, such that the queue becomes idle after the
+ * completion, but the next request arrives within an idle
+ * time slice; used only if the queue's IO_bound flag has been
+ * cleared.
+ */
+ unsigned int requests_within_timer;
+
+ /* pid of the process owning the queue, used for logging purposes */
+ pid_t pid;
+
+ /*
+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
+ * if the queue is shared.
+ */
+ struct bfq_io_cq *bic;
+
+ /* current maximum weight-raising time for this queue */
+ unsigned long wr_cur_max_time;
+ /*
+ * Minimum time instant such that, only if a new request is
+ * enqueued after this time instant in an idle @bfq_queue with
+ * no outstanding requests, then the task associated with the
+ * queue is deemed as soft real-time (see the comments on
+ * the function bfq_bfqq_softrt_next_start())
+ */
+ unsigned long soft_rt_next_start;
+ /*
+ * Start time of the current weight-raising period if
+ * the @bfq_queue is being weight-raised, otherwise
+ * finish time of the last weight-raising period.
+ */
+ unsigned long last_wr_start_finish;
+ /* factor by which the weight of this queue is multiplied */
+ unsigned int wr_coeff;
+ /*
+ * Time of the last transition of the @bfq_queue from idle to
+ * backlogged.
+ */
+ unsigned long last_idle_bklogged;
+ /*
+ * Cumulative service received from the @bfq_queue since the
+ * last transition from idle to backlogged.
+ */
+ unsigned long service_from_backlogged;
+
+ /*
+ * Value of wr start time when switching to soft rt
+ */
+ unsigned long wr_start_at_switch_to_srt;
+
+ unsigned long split_time; /* time of last split (a jiffies value) */
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ *
+ * Besides the two process queues, it holds the saved_* snapshot of
+ * per-queue state taken when a queue is merged, so that the state can
+ * be restored if the queue is later split.
+ */
+struct bfq_io_cq {
+ /* associated io_cq structure */
+ struct io_cq icq; /* must be the first member */
+ /* array of two process queues, the sync and the async */
+ struct bfq_queue *bfqq[2];
+ /* per (request_queue, blkcg) ioprio */
+ int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+ /*
+ * Snapshot of the idle window before merging; taken to
+ * remember this value while the queue is merged, so as to be
+ * able to restore it in case of split.
+ */
+ bool saved_idle_window;
+ /*
+ * Same purpose as the previous field for the I/O bound
+ * classification of a queue.
+ */
+ bool saved_IO_bound;
+
+ /*
+ * Same purpose as the previous fields for the value of the
+ * field recording whether the queue belongs to a large burst.
+ */
+ bool saved_in_large_burst;
+ /*
+ * True if the queue belonged to a burst list before its merge
+ * with another cooperating queue.
+ */
+ bool was_in_burst_list;
+
+ /*
+ * Similar to previous fields: save wr information.
+ * NOTE(review): saved_wr_cur_max_time is an unsigned int while
+ * bfq_queue's wr_cur_max_time is an unsigned long, and
+ * saved_wr_coeff is an unsigned long while bfq_queue's wr_coeff
+ * is an unsigned int -- confirm the width mismatch is harmless.
+ */
+ unsigned long saved_wr_coeff;
+ unsigned long saved_last_wr_start_finish;
+ unsigned long saved_wr_start_at_switch_to_srt;
+ unsigned int saved_wr_cur_max_time;
+ struct bfq_ttime saved_ttime;
+};
+
+enum bfq_device_speed { /* device-speed classes, stored in the device_speed field of struct bfq_data */
+ BFQ_BFQD_FAST,
+ BFQ_BFQD_SLOW,
+};
+
+/**
+ * struct bfq_data - per-device data structure.
+ *
+ * All the fields are protected by @lock.
+ */
+struct bfq_data {
+ /* device request queue */
+ struct request_queue *queue;
+ /* dispatch queue */
+ struct list_head dispatch;
+
+ /* root bfq_group for the device */
+ struct bfq_group *root_group;
+
+ /*
+ * rbtree of weight counters of @bfq_queues, sorted by
+ * weight. Used to keep track of whether all @bfq_queues have
+ * the same weight. The tree contains one counter for each
+ * distinct weight associated to some active and not
+ * weight-raised @bfq_queue (see the comments to the functions
+ * bfq_weights_tree_[add|remove] for further details).
+ */
+ struct rb_root queue_weights_tree;
+ /*
+ * rbtree of non-queue @bfq_entity weight counters, sorted by
+ * weight. Used to keep track of whether all @bfq_groups have
+ * the same weight. The tree contains one counter for each
+ * distinct weight associated to some active @bfq_group (see
+ * the comments to the functions bfq_weights_tree_[add|remove]
+ * for further details).
+ */
+ struct rb_root group_weights_tree;
+
+ /*
+ * Number of bfq_queues containing requests (including the
+ * queue in service, even if it is idling).
+ */
+ int busy_queues;
+ /* number of weight-raised busy @bfq_queues */
+ int wr_busy_queues;
+ /* number of queued requests */
+ int queued;
+ /* number of requests dispatched and waiting for completion */
+ int rq_in_driver;
+
+ /*
+ * Maximum number of requests in driver in the last
+ * @hw_tag_samples completed requests.
+ */
+ int max_rq_in_driver;
+ /* number of samples used to calculate hw_tag */
+ int hw_tag_samples;
+ /* flag set to one if the driver is showing a queueing behavior */
+ int hw_tag;
+
+ /* number of budgets assigned */
+ int budgets_assigned;
+
+ /*
+ * Timer set when idling (waiting) for the next request from
+ * the queue in service.
+ */
+ struct hrtimer idle_slice_timer;
+
+ /* bfq_queue in service */
+ struct bfq_queue *in_service_queue;
+
+ /* on-disk position of the last served request */
+ sector_t last_position;
+
+ /* time of last request completion (ns) */
+ u64 last_completion;
+
+ /* time of first rq dispatch in current observation interval (ns) */
+ u64 first_dispatch;
+ /* time of last rq dispatch in current observation interval (ns) */
+ u64 last_dispatch;
+
+ /* beginning of the last budget */
+ ktime_t last_budget_start;
+ /* beginning of the last idle slice */
+ ktime_t last_idling_start;
+
+ /* number of samples in current observation interval */
+ int peak_rate_samples;
+ /* num of samples of seq dispatches in current observation interval */
+ u32 sequential_samples;
+ /* total num of sectors transferred in current observation interval */
+ u64 tot_sectors_dispatched;
+ /* max rq size seen during current observation interval (sectors) */
+ u32 last_rq_max_size;
+ /* time elapsed from first dispatch in current observ. interval (us) */
+ u64 delta_from_first;
+ /*
+ * Current estimate of the device peak rate, measured in
+ * [BFQ_RATE_SHIFT * sectors/usec]. The left-shift by
+ * BFQ_RATE_SHIFT is performed to increase precision in
+ * fixed-point calculations.
+ */
+ u32 peak_rate;
+
+ /* maximum budget allotted to a bfq_queue before rescheduling */
+ int bfq_max_budget;
+
+ /* list of all the bfq_queues active on the device */
+ struct list_head active_list;
+ /* list of all the bfq_queues idle on the device */
+ struct list_head idle_list;
+
+ /*
+ * Timeout for async/sync requests; when it fires, requests
+ * are served in fifo order.
+ */
+ u64 bfq_fifo_expire[2];
+ /* weight of backward seeks wrt forward ones */
+ unsigned int bfq_back_penalty;
+ /* maximum allowed backward seek */
+ unsigned int bfq_back_max;
+ /* maximum idling time */
+ u32 bfq_slice_idle;
+
+ /* user-configured max budget value (0 for auto-tuning) */
+ int bfq_user_max_budget;
+ /*
+ * Timeout for bfq_queues to consume their budget; used to
+ * prevent seeky queues from imposing long latencies to
+ * sequential or quasi-sequential ones (this also implies that
+ * seeky queues cannot receive guarantees in the service
+ * domain; after a timeout they are charged for the time they
+ * have been in service, to preserve fairness among them, but
+ * without service-domain guarantees).
+ */
+ unsigned int bfq_timeout;
+
+ /*
+ * Number of consecutive requests that must be issued within
+ * the idle time slice to set again idling to a queue which
+ * was marked as non-I/O-bound (see the definition of the
+ * IO_bound flag for further details).
+ */
+ unsigned int bfq_requests_within_timer;
+
+ /*
+ * Force device idling whenever needed to provide accurate
+ * service guarantees, without caring about throughput
+ * issues. CAVEAT: this may even increase latencies, in case
+ * of useless idling for processes that did stop doing I/O.
+ */
+ bool strict_guarantees;
+
+ /*
+ * Last time at which a queue entered the current burst of
+ * queues being activated shortly after each other; for more
+ * details about this and the following parameters related to
+ * a burst of activations, see the comments on the function
+ * bfq_handle_burst.
+ */
+ unsigned long last_ins_in_burst;
+ /*
+ * Reference time interval used to decide whether a queue has
+ * been activated shortly after @last_ins_in_burst.
+ */
+ unsigned long bfq_burst_interval;
+ /* number of queues in the current burst of queue activations */
+ int burst_size;
+
+ /* common parent entity for the queues in the burst */
+ struct bfq_entity *burst_parent_entity;
+ /*
+ * Maximum burst size above which the current queue-activation
+ * burst is deemed as 'large'.
+ */
+ unsigned long bfq_large_burst_thresh;
+ /* true if a large queue-activation burst is in progress */
+ bool large_burst;
+ /*
+ * Head of the burst list (as for the above fields, more
+ * details in the comments on the function bfq_handle_burst).
+ */
+ struct hlist_head burst_list;
+
+ /* if set to true, low-latency heuristics are enabled */
+ bool low_latency;
+ /*
+ * Maximum factor by which the weight of a weight-raised queue
+ * is multiplied.
+ */
+ unsigned int bfq_wr_coeff;
+ /* maximum duration of a weight-raising period (jiffies) */
+ unsigned int bfq_wr_max_time;
+
+ /* Maximum weight-raising duration for soft real-time processes */
+ unsigned int bfq_wr_rt_max_time;
+ /*
+ * Minimum idle period after which weight-raising may be
+ * reactivated for a queue (in jiffies).
+ */
+ unsigned int bfq_wr_min_idle_time;
+ /*
+ * Minimum period between request arrivals after which
+ * weight-raising may be reactivated for an already busy async
+ * queue (in jiffies).
+ */
+ unsigned long bfq_wr_min_inter_arr_async;
+
+ /* Max service-rate for a soft real-time queue, in sectors/sec */
+ unsigned int bfq_wr_max_softrt_rate;
+ /*
+ * Cached value of the product R*T, used for computing the
+ * maximum duration of weight raising automatically.
+ */
+ u64 RT_prod;
+ /* device-speed class for the low-latency heuristic */
+ enum bfq_device_speed device_speed;
+
+ /* fallback dummy bfqq for extreme OOM conditions */
+ struct bfq_queue oom_bfqq;
+
+ spinlock_t lock; /* the scheduler lock: protects all the fields of this structure */
+
+ /*
+ * bic associated with the task issuing current bio for
+ * merging. This and the next field are used as a support to
+ * be able to perform the bic lookup, needed by bio-merge
+ * functions, before the scheduler lock is taken, and thus
+ * avoid taking the request-queue lock while the scheduler
+ * lock is being held.
+ */
+ struct bfq_io_cq *bio_bic;
+ /* bfqq associated with the task issuing current bio for merging */
+ struct bfq_queue *bio_bfqq;
+};
+
+enum bfqq_state_flags { /* per-queue state flags; one accessor triple per flag is declared through BFQ_BFQQ_FNS */
+ BFQQF_just_created = 0, /* queue just allocated */
+ BFQQF_busy, /* has requests or is in service */
+ BFQQF_wait_request, /* waiting for a request */
+ BFQQF_non_blocking_wait_rq, /*
+ * waiting for a request
+ * without idling the device
+ */
+ BFQQF_fifo_expire, /* FIFO checked in this slice */
+ BFQQF_idle_window, /* slice idling enabled */
+ BFQQF_sync, /* synchronous queue */
+ BFQQF_IO_bound, /*
+ * bfqq has timed-out at least once
+ * having consumed at most 2/10 of
+ * its budget
+ */
+ BFQQF_in_large_burst, /*
+ * bfqq activated in a large burst,
+ * see comments to bfq_handle_burst.
+ */
+ BFQQF_softrt_update, /*
+ * may need an update of
+ * soft_rt_next_start
+ */
+ BFQQF_coop, /* bfqq is shared */
+ BFQQF_split_coop /* shared bfqq will be split */
+};
+
+/*
+ * Declare, for the state flag BFQQF_<name>, the three functions
+ * bfq_mark_bfqq_<name>(), bfq_clear_bfqq_<name>() and bfq_bfqq_<name>().
+ *
+ * The macro body deliberately does not end with a semicolon: the
+ * terminating ';' is supplied by each invocation below, so that no
+ * stray empty declaration is left at file scope.
+ */
+#define BFQ_BFQQ_FNS(name)						\
+void bfq_mark_bfqq_##name(struct bfq_queue *bfqq);			\
+void bfq_clear_bfqq_##name(struct bfq_queue *bfqq);			\
+int bfq_bfqq_##name(const struct bfq_queue *bfqq)
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS
+
+/* Expiration reasons; passed as the reason argument of bfq_bfqq_expire(). */
+enum bfqq_expiration {
+ BFQQE_TOO_IDLE = 0, /*
+ * queue has been idling for
+ * too long
+ */
+ BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */
+ BFQQE_BUDGET_EXHAUSTED, /* budget consumed */
+ BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */
+ BFQQE_PREEMPTED /* preemption in progress */
+};
+
+struct bfqg_stats { /* per-group statistics; has no members unless CONFIG_BFQ_GROUP_IOSCHED is set */
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ /* number of ios merged */
+ struct blkg_rwstat merged;
+ /* total time spent on device in ns, may not be accurate w/ queueing */
+ struct blkg_rwstat service_time;
+ /* total time spent waiting in scheduler queue in ns */
+ struct blkg_rwstat wait_time;
+ /* number of IOs queued up */
+ struct blkg_rwstat queued;
+ /*
+ * total disk time dispatched by this group
+ * NOTE(review): this comment used to mention "nr sectors" too, but
+ * no sectors counter is declared in this structure - confirm.
+ */
+ struct blkg_stat time;
+ /* sum of number of ios queued across all samples */
+ struct blkg_stat avg_queue_size_sum;
+ /* count of samples taken for average */
+ struct blkg_stat avg_queue_size_samples;
+ /* how many times this group has been removed from service tree */
+ struct blkg_stat dequeue;
+ /* total time spent waiting for it to be assigned a timeslice. */
+ struct blkg_stat group_wait_time;
+ /* time spent idling for this blkcg_gq */
+ struct blkg_stat idle_time;
+ /* total time with empty current active q with other requests queued */
+ struct blkg_stat empty_time;
+ /* fields after this shouldn't be cleared on stat reset */
+ uint64_t start_group_wait_time;
+ uint64_t start_idle_time;
+ uint64_t start_empty_time;
+ uint16_t flags;
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @pd: @blkcg_policy_data that this structure inherits
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+ /* must be the first member */
+ struct blkcg_policy_data pd;
+
+ unsigned int weight;
+};
+
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @pd: @blkg_policy_data that this structure inherits; must be the
+ * first member.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ * both bfq_queues and bfq_groups).
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ * the group, one queue per ioprio value per ioprio_class,
+ * except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ * to avoid too many special cases during group creation/
+ * migration.
+ * @stats: stats for this bfqg.
+ * @active_entities: number of active entities belonging to the group;
+ * unused for the root group. Used to know whether there
+ * are groups with more than one active @bfq_entity
+ * (see the comments to the function
+ * bfq_bfqq_may_idle()).
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ * determining if two or more queues have interleaving
+ * requests (see bfq_find_close_cooperator()).
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ * o @bfqd is protected by the queue lock, RCU is used to access it
+ * from the readers.
+ * o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+ /* must be the first member */
+ struct blkg_policy_data pd;
+
+ struct bfq_entity entity;
+ struct bfq_sched_data sched_data;
+
+ void *bfqd;
+
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+ struct bfq_queue *async_idle_bfqq;
+
+ struct bfq_entity *my_entity;
+
+ int active_entities;
+
+ struct rb_root rq_pos_tree;
+
+ struct bfqg_stats stats;
+};
+
+#else
+struct bfq_group { /* reduced variant, used when CONFIG_BFQ_GROUP_IOSCHED is not set */
+ struct bfq_sched_data sched_data;
+
+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+ struct bfq_queue *async_idle_bfqq;
+
+ struct rb_root rq_pos_tree;
+};
+#endif
+
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); /* declared again in the B-WF2Q+ interface section of this header */
+
+/* --------------- main algorithm interface ----------------- */
+
+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+extern const int bfq_timeout; /* file-scope constant; a distinct object from the bfq_timeout field of struct bfq_data */
+
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
+struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); /* declared again in the B-WF2Q+ interface section of this header */
+void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_entity *entity,
+ struct rb_root *root);
+void bfq_weights_tree_remove(struct bfq_data *bfqd, struct bfq_entity *entity,
+ struct rb_root *root);
+void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool compensate, enum bfqq_expiration reason);
+void bfq_put_queue(struct bfq_queue *bfqq);
+void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+void bfq_schedule_dispatch(struct bfq_data *bfqd);
+void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+
+/* ------------ end of main algorithm interface -------------- */
+
+/*
+ * ---------------- cgroups-support interface ----------------
+ *
+ * The prototypes below are declared unconditionally; only the extern
+ * data declarations at the end of this section depend on
+ * CONFIG_BFQ_GROUP_IOSCHED.
+ */
+
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+ unsigned int op);
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_completion(struct bfq_group *bfqg, uint64_t start_time,
+ uint64_t io_start_time, unsigned int op);
+void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
+void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_group *bfqg);
+
+void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
+void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
+void bfq_end_wr_async(struct bfq_data *bfqd);
+struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+ struct blkcg *blkcg);
+struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq); /* declared again in the logging section when CONFIG_BFQ_GROUP_IOSCHED is set */
+struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
+void bfqg_put(struct bfq_group *bfqg);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+extern struct cftype bfq_blkcg_legacy_files[];
+extern struct cftype bfq_blkg_files[];
+extern struct blkcg_policy blkcg_policy_bfq;
+#endif
+
+/* ------------- end of cgroups-support interface ------------- */
+
+/* - interface of the internal hierarchical B-WF2Q+ scheduler - */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/*
+ * Both next loops stop at one of the child entities of the root group.
+ * Neither loop has an initialization clause for entity: the walk
+ * starts from the current value of entity, and entity itself is the
+ * variable being advanced (it is NULL when the loop runs to its end).
+ */
+#define for_each_entity(entity) \
+ for (; entity ; entity = entity->parent)
+
+/*
+ * For each iteration, compute parent in advance, so as to be safe if
+ * entity is deallocated during the iteration. Such a deallocation may
+ * happen as a consequence of a bfq_put_queue that frees the bfq_queue
+ * containing entity.
+ */
+#define for_each_entity_safe(entity, parent) \
+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+/*
+ * Next two macros are fake loops when cgroups support is not
+ * enabled. In fact, in such a case, there is only one level to go up
+ * (to reach the root group): the body is executed at most once, then
+ * entity is set to NULL.
+ */
+#define for_each_entity(entity) \
+ for (; entity ; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+ for (parent = NULL; entity ; entity = parent)
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); /* repeats the declaration placed before the main algorithm interface */
+struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
+struct bfq_entity *bfq_entity_of(struct rb_node *node);
+unsigned short bfq_ioprio_to_weight(int ioprio);
+void bfq_put_idle_entity(struct bfq_service_tree *st,
+ struct bfq_entity *entity);
+struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+ struct bfq_entity *entity);
+void bfq_bfqq_served(struct bfq_queue *bfqq, int served);
+void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ unsigned long time_ms);
+bool __bfq_deactivate_entity(struct bfq_entity *entity,
+ bool ins_into_idle_tree);
+bool next_queue_may_preempt(struct bfq_data *bfqd);
+struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd);
+void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd);
+void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool ins_into_idle_tree, bool expiration);
+void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); /* repeats the declaration in the main algorithm interface */
+void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ bool expiration);
+void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+/* --------------- end of interface of B-WF2Q+ ---------------- */
+
+/*
+ * Logging facilities. All messages are emitted through
+ * blk_add_trace_msg() on the request queue of the device.
+ *
+ * bfq_log_bfqq tags each message with "bfq", the pid stored in the
+ * queue and 'S' or 'A' depending on the value of bfq_bfqq_sync().
+ * With cgroups support, bfq_log_bfqq and bfq_log_bfqg also insert the
+ * path of the involved group, obtained with blkg_path() into a
+ * 128-byte on-stack buffer; without cgroups support, bfq_log_bfqg
+ * expands to an empty statement.
+ */
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
+ __pbuf, ##args); \
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
+ char __pbuf[128]; \
+ \
+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \
+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \
+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \
+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
+ ##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log(bfqd, fmt, args...) \
+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+
+#endif /* _BFQ_H */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
new file mode 100644
index 0000000000000..b4fc3e4260b71
--- /dev/null
+++ b/block/bfq-wf2q.c
@@ -0,0 +1,1616 @@
+/*
+ * Hierarchical Budget Worst-case Fair Weighted Fair Queueing
+ * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O
+ * scheduler schedules generic entities. The latter can represent
+ * either single bfq queues (associated with processes) or groups of
+ * bfq queues (associated with cgroups).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include "bfq-iosched.h"
+
+/**
+ * bfq_gt - wrap-safe "greater than" test for timestamps.
+ * @a: first timestamp.
+ * @b: second timestamp.
+ *
+ * The unsigned difference @a - @b is reinterpreted as a signed 64-bit
+ * quantity; the result is non-zero iff that quantity is strictly
+ * positive. In this way the comparison stays meaningful when the
+ * timestamps wrap around.
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+	s64 diff = (s64)(a - b);
+
+	return diff > 0;
+}
+
+/*
+ * Return the entity embedding the root node of @tree. The root pointer
+ * is converted unconditionally: no check for an empty tree is made
+ * here.
+ */
+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree)
+{
+	return rb_entry(tree->rb_node, struct bfq_entity, rb_node);
+}
+
+/*
+ * Zero-based class index of @entity: the ioprio class of the queue
+ * minus one if @entity is a queue, BFQ_DEFAULT_GRP_CLASS minus one
+ * otherwise.
+ */
+static unsigned int bfq_class_idx(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	if (bfqq)
+		return bfqq->ioprio_class - 1;
+
+	return BFQ_DEFAULT_GRP_CLASS - 1;
+}
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); /* forward declaration, needed by bfq_update_next_in_service() */
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); /* forward declaration, needed by bfq_update_next_in_service() */
+
+/**
+ * bfq_update_next_in_service - update sd->next_in_service
+ * @sd: sched_data for which to perform the update.
+ * @new_entity: if not NULL, pointer to the entity whose activation,
+ *		 requeueing or repositioning triggered the invocation of
+ *		 this function.
+ *
+ * This function is called to update sd->next_in_service, which, in
+ * its turn, may change as a consequence of the insertion or
+ * extraction of an entity into/from one of the active trees of
+ * sd. These insertions/extractions occur as a consequence of
+ * activations/deactivations of entities, with some activations being
+ * 'true' activations, and other activations being requeueings (i.e.,
+ * implementing the second, requeueing phase of the mechanism used to
+ * reposition an entity in its active tree; see comments on
+ * __bfq_activate_entity and __bfq_requeue_entity for details). In
+ * both the last two activation sub-cases, new_entity points to the
+ * just activated or requeued entity.
+ *
+ * Returns true if sd->next_in_service changes in such a way that
+ * entity->parent may become the next_in_service for its parent
+ * entity. More precisely, true is returned only if a next-in-service
+ * entity exists after the update and, in addition, either there was
+ * none before the update or bfq_update_parent_budget() returns true.
+ */
+static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
+				       struct bfq_entity *new_entity)
+{
+	struct bfq_entity *next_in_service = sd->next_in_service;
+	bool parent_sched_may_change = false;
+
+	/*
+	 * If this update is triggered by the activation, requeueing
+	 * or repositioning of an entity that does not coincide with
+	 * sd->next_in_service, then a full lookup in the active tree
+	 * can be avoided. In fact, it is enough to check whether the
+	 * just-modified entity has a higher priority than
+	 * sd->next_in_service, or, even if it has the same priority
+	 * as sd->next_in_service, is eligible and has a lower virtual
+	 * finish time than sd->next_in_service. If this compound
+	 * condition holds, then the new entity becomes the new
+	 * next_in_service. Otherwise no change is needed.
+	 */
+	if (new_entity && new_entity != sd->next_in_service) {
+		/*
+		 * Flag used to decide whether to replace
+		 * sd->next_in_service with new_entity. Tentatively
+		 * set to true, and left as true if
+		 * sd->next_in_service is NULL.
+		 */
+		bool replace_next = true;
+
+		/*
+		 * If there is already a next_in_service candidate
+		 * entity, then compare class priorities or timestamps
+		 * to decide whether to replace sd->next_in_service
+		 * with new_entity.
+		 */
+		if (next_in_service) {
+			unsigned int new_entity_class_idx =
+				bfq_class_idx(new_entity);
+			struct bfq_service_tree *st =
+				sd->service_tree + new_entity_class_idx;
+
+			/*
+			 * For efficiency, evaluate the most likely
+			 * sub-condition first.
+			 */
+			replace_next =
+				(new_entity_class_idx ==
+				 bfq_class_idx(next_in_service)
+				 &&
+				 !bfq_gt(new_entity->start, st->vtime)
+				 &&
+				 bfq_gt(next_in_service->finish,
+					new_entity->finish))
+				||
+				new_entity_class_idx <
+				bfq_class_idx(next_in_service);
+		}
+
+		if (replace_next)
+			next_in_service = new_entity;
+	} else {
+		/*
+		 * Invoked because of a deactivation (new_entity is
+		 * NULL), or for the very entity that is already
+		 * sd->next_in_service: a full lookup is needed.
+		 */
+		next_in_service = bfq_lookup_next_entity(sd);
+	}
+
+	/*
+	 * sd->next_in_service still holds the old value here: a NULL
+	 * old value means that sd had no candidate before this update.
+	 */
+	if (next_in_service) {
+		parent_sched_may_change = !sd->next_in_service ||
+			bfq_update_parent_budget(next_in_service);
+	}
+
+	sd->next_in_service = next_in_service;
+
+	return parent_sched_may_change;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/*
+ * Return the group @bfqq belongs to: the group embedding the parent
+ * entity of the queue or, if the queue has no parent entity, the root
+ * group of the device.
+ */
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *parent_entity = bfqq->entity.parent;
+
+	/* no parent entity: fall back to the entity of the root group */
+	if (!parent_entity)
+		parent_entity = &bfqq->bfqd->root_group->entity;
+
+	return container_of(parent_entity, struct bfq_group, entity);
+}
+
+/*
+ * Propagate the budget of @next_in_service to the entity of the group
+ * owning the sched_data that @next_in_service belongs to.
+ *
+ * Returns true if this budget change may let next_in_service->parent
+ * become the next_in_service entity for its parent entity, which is
+ * the case when the budget of the group entity decreases.
+ */
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+	struct bfq_sched_data *group_sd = next_in_service->sched_data;
+	struct bfq_group *bfqg =
+		container_of(group_sd, struct bfq_group, sched_data);
+	struct bfq_entity *bfqg_entity = bfqg->my_entity;
+	bool budget_lowered;
+
+	/*
+	 * my_entity is NULL for the root group only. The root entity
+	 * must be left untouched, as it must never become an
+	 * in-service entity.
+	 */
+	if (!bfqg_entity)
+		return false;
+
+	budget_lowered = bfqg_entity->budget > next_in_service->budget;
+	bfqg_entity->budget = next_in_service->budget;
+
+	return budget_lowered;
+}
+
+/*
+ * Tell whether @entity stops being a candidate for next service.
+ *
+ * The function is invoked for an entity that is about to be set in
+ * service. A queue in that condition is no longer a candidate for
+ * next service (i.e., a candidate entity to serve after the
+ * in-service entity is expired), hence true is returned for it.
+ *
+ * A non-queue entity, instead, could still be a candidate for next
+ * service if it has more than one child: even if one of its children
+ * is about to be set in service, other children may still be the
+ * next to serve. So, for a non-queue entity, true is returned only
+ * if the entity has exactly one active child.
+ */
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+	struct bfq_group *bfqg;
+
+	if (bfq_entity_to_bfqq(entity))
+		return true;
+
+	bfqg = container_of(entity, struct bfq_group, entity);
+
+	return bfqg->active_entities == 1;
+}
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Without cgroups support every queue belongs to the root group of
+ * its device.
+ */
+struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	return bfqd->root_group;
+}
+
+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
+{
+ return false; /* stub for builds without cgroups support: no budget is updated */
+}
+
+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
+{
+ return true; /* stub for builds without cgroups support: unconditionally true */
+}
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+/*
+ * Shift for timestamp calculations. This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time
+ * wraparounds.
+ *
+ * It is applied by bfq_delta(), which left-shifts the amount of
+ * service by this value before dividing it by the weight.
+ */
+#define WFQ_SERVICE_SHIFT 22
+
+/*
+ * Return the bfq_queue embedding @entity, or NULL if @entity has its
+ * own sched_data (my_sched_data is set), i.e., it is not a queue.
+ */
+struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+{
+	if (entity->my_sched_data)
+		return NULL;
+
+	return container_of(entity, struct bfq_queue, entity);
+}
+
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor (weight of an entity or weight sum).
+ *
+ * The service is widened to 64 bits, left-shifted by
+ * WFQ_SERVICE_SHIFT and then divided by @weight with do_div(), which
+ * stores the quotient back into its first argument.
+ */
+static u64 bfq_delta(unsigned long service, unsigned long weight)
+{
+	u64 vdelta = service;
+
+	vdelta <<= WFQ_SERVICE_SHIFT;
+	do_div(vdelta, weight);
+
+	return vdelta;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ *
+ * The finish time is the start time of @entity plus @service mapped
+ * into the virtual time domain through the weight of @entity. If
+ * @entity is a queue, the computed values are also logged.
+ */
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	u64 delta = bfq_delta(service, entity->weight);
+
+	entity->finish = entity->start + delta;
+
+	/* only queues are logged */
+	if (!bfqq)
+		return;
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		     "calc_finish: serv %lu, w %d",
+		     service, entity->weight);
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		     "calc_finish: start %llu, finish %llu, delta %llu",
+		     entity->start, entity->finish, delta);
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity, possibly %NULL.
+ *
+ * NULL-tolerant conversion from a node pointer to the entity that
+ * embeds it: a %NULL @node yields a %NULL entity. It is meant to
+ * simplify the logic of some functions, not to be the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+struct bfq_entity *bfq_entity_of(struct rb_node *node)
+{
+	if (!node)
+		return NULL;
+
+	return rb_entry(node, struct bfq_entity, rb_node);
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ *
+ * Besides erasing the node of @entity from @root, reset the tree
+ * pointer of @entity, so that the entity is recorded as being in no
+ * tree.
+ */
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
+{
+	rb_erase(&entity->rb_node, root);
+	entity->tree = NULL;
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ *
+ * If @entity is the cached first or last idle entity of @st, the
+ * cached pointer is moved to the neighbour of @entity in the tree
+ * (or becomes NULL if there is none) before the removal. If @entity
+ * is a queue, the queue is also deleted from the list it is linked
+ * to through bfqq_list.
+ */
+static void bfq_idle_extract(struct bfq_service_tree *st,
+			     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	if (st->first_idle == entity)
+		st->first_idle = bfq_entity_of(rb_next(&entity->rb_node));
+
+	if (st->last_idle == entity)
+		st->last_idle = bfq_entity_of(rb_prev(&entity->rb_node));
+
+	bfq_extract(&st->idle, entity);
+
+	if (bfqq)
+		list_del(&bfqq->bfqq_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time. While descending, the left child is taken
+ * only if the finish time of the visited entity is strictly greater
+ * (in the bfq_gt() sense) than that of @entity; hence an entity with
+ * a finish time equal to that of an already-present one is placed to
+ * its right.
+ */
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
+{
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	while (*link) {
+		struct bfq_entity *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct bfq_entity, rb_node);
+
+		link = bfq_gt(cur->finish, entity->finish) ?
+			&parent->rb_left : &parent->rb_right;
+	}
+
+	rb_link_node(&entity->rb_node, parent, link);
+	rb_insert_color(&entity->rb_node, root);
+
+	/* record the tree the entity now belongs to */
+	entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of an entity.
+ * @entity: the entity to update.
+ * @node: one of its children, possibly NULL.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree. The function assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value. The min_start of @entity is
+ * lowered to that of the child if the latter is smaller; nothing is
+ * done if @node is NULL.
+ */
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
+{
+	struct bfq_entity *child;
+
+	if (!node)
+		return;
+
+	child = rb_entry(node, struct bfq_entity, rb_node);
+	if (bfq_gt(entity->min_start, child->min_start))
+		entity->min_start = child->min_start;
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value. The left and right subtrees
+ * are assumed to hold a correct min_start value. min_start is first
+ * reset to the start time of the node itself, and then lowered using
+ * the right child first and the left child next.
+ */
+static void bfq_update_active_node(struct rb_node *node)
+{
+	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
+	struct rb_node *children[2] = { node->rb_right, node->rb_left };
+	int i;
+
+	entity->min_start = entity->start;
+
+	for (i = 0; i < 2; i++)
+		bfq_update_min(entity, children[i]);
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update. This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root. The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ *
+ * At each level the sibling of the current node, if any, is refreshed
+ * too. A node without a sibling is not recomputed a second time: its
+ * min_start has just been rebuilt and nothing below it has changed in
+ * between.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+	struct rb_node *parent, *sibling;
+
+	for (;;) {
+		bfq_update_active_node(node);
+
+		parent = rb_parent(node);
+		if (!parent)
+			return;
+
+		/* Pick the other child of parent, which may be absent. */
+		sibling = node == parent->rb_left ?
+			parent->rb_right : parent->rb_left;
+		if (sibling)
+			bfq_update_active_node(sibling);
+
+		node = parent;
+	}
+}
+
+/**
+ * bfq_active_insert - add an entity to the active tree of its
+ *                     group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but every node also keeps
+ * an extra key holding the minimum start time found in its subtree (the
+ * node itself included), so that the eligible node with the lowest
+ * finish time can be found in logarithmic time.
+ *
+ * A queue entity is also linked into the active list of its device.
+ * With CONFIG_BFQ_GROUP_IOSCHED, a non-queue entity is added to the
+ * group weights tree, and the active-entity counter of the containing
+ * group is incremented unless that group is the root group.
+ */
+static void bfq_active_insert(struct bfq_service_tree *st,
+			      struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *own = &entity->rb_node;
+	struct rb_node *child;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_group *bfqg;
+	struct bfq_data *bfqd;
+#endif
+
+	bfq_insert(&st->active, entity);
+
+	/*
+	 * Refresh min_start starting from a child of the new node when
+	 * it has one (left preferred), otherwise from the node itself.
+	 */
+	child = own->rb_left ? own->rb_left : own->rb_right;
+	bfq_update_active_tree(child ? child : own);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg = container_of(entity->sched_data, struct bfq_group, sched_data);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	if (!bfqq) /* bfq_group */
+		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+
+	if (bfqg != bfqd->root_group)
+		bfqg->active_entities++;
+#endif
+}
+
+/**
+ * bfq_ioprio_to_weight - convert an ioprio level into a weight.
+ * @ioprio: the ioprio value to convert.
+ *
+ * The weight is the distance of @ioprio from IOPRIO_BE_NR, scaled by
+ * BFQ_WEIGHT_CONVERSION_COEFF.
+ */
+unsigned short bfq_ioprio_to_weight(int ioprio)
+{
+	int levels = IOPRIO_BE_NR - ioprio;
+
+	return levels * BFQ_WEIGHT_CONVERSION_COEFF;
+}
+
+/**
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
+ * @weight: the weight value to convert.
+ *
+ * This is the inverse of bfq_ioprio_to_weight(): the weight is first
+ * scaled back by BFQ_WEIGHT_CONVERSION_COEFF and the result is then
+ * subtracted from IOPRIO_BE_NR, so that the returned value is an ioprio
+ * level and not an ioprio level multiplied by the coefficient.
+ *
+ * To preserve as much as possible the old only-ioprio user interface,
+ * 0 is used as an escape ioprio value for weights (numerically) equal or
+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
+ */
+static unsigned short bfq_weight_to_ioprio(int weight)
+{
+	return max_t(int, 0,
+		     IOPRIO_BE_NR - weight / BFQ_WEIGHT_CONVERSION_COEFF);
+}
+
+/*
+ * Take a reference on the queue behind @entity, if @entity represents a
+ * queue, and log the new reference count. Entities for which
+ * bfq_entity_to_bfqq() returns NULL are left untouched.
+ */
+static void bfq_get_entity(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	if (!bfqq)
+		return;
+
+	bfqq->ref++;
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
+		     bfqq, bfqq->ref);
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Performs the first step of an rb-tree extraction: it works out which
+ * node will take the place of @node and returns the deepest node that
+ * the subsequent changes to the tree can touch. Returns %NULL if @node
+ * is the last node in the tree.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+	struct rb_node *succ;
+
+	/* At most one child: the child, or else the parent, is affected. */
+	if (!node->rb_right)
+		return node->rb_left ? node->rb_left : rb_parent(node);
+	if (!node->rb_left)
+		return node->rb_right;
+
+	/* Two children: reason on the in-order successor. */
+	succ = rb_next(node);
+	if (succ->rb_right)
+		return succ->rb_right;
+	if (rb_parent(succ) != node)
+		return rb_parent(succ);
+
+	return succ;
+}
+
+/**
+ * bfq_active_extract - take an entity off the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ *
+ * After the removal, min_start is refreshed starting from the deepest
+ * node the extraction may have touched. A queue entity is also unlinked
+ * from the list it sits on. With CONFIG_BFQ_GROUP_IOSCHED, a non-queue
+ * entity is removed from the group weights tree, and the active-entity
+ * counter of the containing group is decremented unless that group is
+ * the root group.
+ */
+static void bfq_active_extract(struct bfq_service_tree *st,
+			       struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *deepest;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_group *bfqg;
+	struct bfq_data *bfqd;
+#endif
+
+	/* Must be computed before the tree is modified. */
+	deepest = bfq_find_deepest(&entity->rb_node);
+	bfq_extract(&st->active, entity);
+
+	if (deepest)
+		bfq_update_active_tree(deepest);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg = container_of(entity->sched_data, struct bfq_group, sched_data);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq)
+		list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	if (!bfqq) /* bfq_group */
+		bfq_weights_tree_remove(bfqd, entity,
+					&bfqd->group_weights_tree);
+
+	if (bfqg != bfqd->root_group)
+		bfqg->active_entities--;
+#endif
+}
+
+/**
+ * bfq_idle_insert - put an entity on the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ *
+ * Besides linking @entity into the tree, this keeps the cached
+ * first_idle/last_idle pointers of @st up to date and, for a queue
+ * entity, adds the queue to the idle list of its device.
+ */
+static void bfq_idle_insert(struct bfq_service_tree *st,
+			    struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_entity *head = st->first_idle;
+	struct bfq_entity *tail = st->last_idle;
+	bool new_first = !head || bfq_gt(head->finish, entity->finish);
+	bool new_last = !tail || bfq_gt(entity->finish, tail->finish);
+
+	if (new_first)
+		st->first_idle = entity;
+	if (new_last)
+		st->last_idle = entity;
+
+	bfq_insert(&st->idle, entity);
+
+	if (bfqq)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - stop considering an entity for scheduling.
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ * @is_in_service: true if entity is currently the in-service entity.
+ *
+ * Drops every trace of @entity from @st: the entity is flagged as no
+ * longer on a service tree and its weight is taken out of the weight
+ * sum of @st. In addition, if @entity represents a queue that is not
+ * in service, the service reference to the queue (the one taken
+ * through bfq_get_entity) is released, since in that case the queue is
+ * also outside any service tree. If, instead, the queue is in service,
+ * then __bfq_bfqd_reset_in_service will take care of putting the
+ * reference when the queue finally stops being served.
+ */
+static void bfq_forget_entity(struct bfq_service_tree *st,
+			      struct bfq_entity *entity,
+			      bool is_in_service)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	st->wsum -= entity->weight;
+	entity->on_st = false;
+
+	if (!bfqq || is_in_service)
+		return;
+
+	bfq_put_queue(bfqq);
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ *
+ * Extracts @entity from the idle tree and then forgets it, telling
+ * bfq_forget_entity() whether @entity is the in-service entity of its
+ * sched_data.
+ */
+void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity)
+{
+	bool in_service;
+
+	bfq_idle_extract(st, entity);
+
+	in_service = entity->sched_data->in_service_entity == entity;
+	bfq_forget_entity(st, entity, in_service);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity at most one entry is
+ * removed here; as the idle tree will not grow indefinitely this can be
+ * done safely.
+ */
+static void bfq_forget_idle(struct bfq_service_tree *st)
+{
+	struct bfq_entity *oldest = st->first_idle;
+	struct bfq_entity *newest = st->last_idle;
+	bool active_empty = RB_EMPTY_ROOT(&st->active);
+
+	/*
+	 * With no active entity and every idle entity already finished
+	 * with respect to vtime, forget the whole idle tree by setting
+	 * vtime to the last finish time of the idle entities.
+	 * NOTE(review): the guard implies last finish <= vtime, so this
+	 * assignment cannot move vtime forward - confirm the intent.
+	 */
+	if (active_empty && newest && !bfq_gt(newest->finish, st->vtime))
+		st->vtime = newest->finish;
+
+	if (!oldest)
+		return;
+	if (bfq_gt(oldest->finish, st->vtime))
+		return;
+
+	bfq_put_idle_entity(st, oldest);
+}
+
+/*
+ * Return the service tree @entity belongs to: the element of the
+ * service_tree set of its sched_data selected by bfq_class_idx().
+ */
+struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity)
+{
+	unsigned int class_idx = bfq_class_idx(entity);
+
+	return &entity->sched_data->service_tree[class_idx];
+}
+
+/**
+ * __bfq_entity_update_weight_prio - apply a pending weight/priority
+ * change to an entity.
+ * @old_st: the service tree whose weight sum currently includes the
+ * weight of @entity.
+ * @entity: the entity to update.
+ *
+ * Nothing is done unless entity->prio_changed is set. Otherwise the
+ * weight of @entity is taken out of the weight sum of @old_st, a new
+ * requested weight (clamped to [BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT]) is
+ * committed to orig_weight, the ioprio and ioprio class of a queue
+ * entity are refreshed, and the service tree is looked up again. The
+ * effective weight is orig_weight multiplied by the weight-raising
+ * coefficient of the queue (1 for non-queue entities); if it differs
+ * from the previous one the entity is moved between weight counters.
+ * Finally the new weight is added to the weight sum of the new service
+ * tree and, if the tree changed, the start time of @entity is reset to
+ * the virtual time of the new tree.
+ *
+ * Return: the service tree @entity belongs to after the update, which
+ * is @old_st itself if no change was pending.
+ */
+struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+ struct bfq_entity *entity)
+{
+ struct bfq_service_tree *new_st = old_st;
+
+ if (entity->prio_changed) {
+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ unsigned int prev_weight, new_weight;
+ struct bfq_data *bfqd = NULL;
+ struct rb_root *root;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_sched_data *sd;
+ struct bfq_group *bfqg;
+#endif
+
+ if (bfqq)
+ bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ else {
+ sd = entity->my_sched_data;
+ bfqg = container_of(sd, struct bfq_group, sched_data);
+ bfqd = (struct bfq_data *)bfqg->bfqd;
+ }
+#endif
+
+ old_st->wsum -= entity->weight;
+
+ if (entity->new_weight != entity->orig_weight) {
+ if (entity->new_weight < BFQ_MIN_WEIGHT ||
+ entity->new_weight > BFQ_MAX_WEIGHT) {
+ pr_crit("update_weight_prio: new_weight %d\n",
+ entity->new_weight);
+ if (entity->new_weight < BFQ_MIN_WEIGHT)
+ entity->new_weight = BFQ_MIN_WEIGHT;
+ else
+ entity->new_weight = BFQ_MAX_WEIGHT;
+ }
+ entity->orig_weight = entity->new_weight;
+ if (bfqq)
+ bfqq->ioprio =
+ bfq_weight_to_ioprio(entity->orig_weight);
+ }
+
+ if (bfqq)
+ bfqq->ioprio_class = bfqq->new_ioprio_class;
+ entity->prio_changed = 0;
+
+ /*
+ * NOTE: here we may be changing the weight too early,
+ * this will cause unfairness. The correct approach
+ * would have required additional complexity to defer
+ * weight changes to the proper time instants (i.e.,
+ * when entity->finish <= old_st->vtime).
+ */
+ new_st = bfq_entity_service_tree(entity);
+
+ prev_weight = entity->weight;
+ new_weight = entity->orig_weight *
+ (bfqq ? bfqq->wr_coeff : 1);
+ /*
+ * If the weight of the entity changes, remove the entity
+ * from its old weight counter (if there is a counter
+ * associated with the entity), and add it to the counter
+ * associated with its new weight.
+ */
+ if (prev_weight != new_weight) {
+ root = bfqq ? &bfqd->queue_weights_tree :
+ &bfqd->group_weights_tree;
+ bfq_weights_tree_remove(bfqd, entity, root);
+ }
+ entity->weight = new_weight;
+ /*
+ * Add the entity to its weights tree only if it is
+ * not associated with a weight-raised queue.
+ */
+ if (prev_weight != new_weight &&
+ (bfqq ? bfqq->wr_coeff == 1 : 1))
+ /*
+ * If we get here, root has been initialized: it is
+ * assigned above under the same
+ * prev_weight != new_weight condition.
+ */
+ bfq_weights_tree_add(bfqd, entity, root);
+
+ new_st->wsum += entity->weight;
+
+ if (new_st != old_st)
+ entity->start = new_st->vtime;
+ }
+
+ return new_st;
+}
+
+/**
+ * bfq_bfqq_served - update the scheduler status after selection for
+ *                   service.
+ * @bfqq: the queue being served.
+ * @served: bytes to transfer.
+ *
+ * For the entity of @bfqq and for each entity visited by
+ * for_each_entity() after it, the received service is increased by
+ * @served, the virtual time of the entity's service tree is advanced
+ * accordingly, and the idle tree is trimmed.
+ *
+ * NOTE: this can be optimized, as the timestamps of upper level entities
+ * are synchronized every time a new bfqq is selected for service. By now,
+ * we keep it to better check consistency.
+ */
+void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	for_each_entity(entity) {
+		struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+		entity->service += served;
+		st->vtime += bfq_delta(served, st->wsum);
+
+		bfq_forget_idle(st);
+	}
+
+	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served);
+}
+
+/**
+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the
+ *                        length of the time interval during which bfqq
+ *                        has been in service.
+ * @bfqd: the device
+ * @bfqq: the queue that needs a service update.
+ * @time_ms: the amount of time during which the queue has received service
+ *
+ * A queue that consumes its budget too slowly may hurt throughput, more
+ * or less severely, if it is granted service fairness. Such queues are
+ * therefore given time fairness instead. Since the BFQ engine works in
+ * the service domain and not in the time domain, this is obtained by
+ * charging the queue an inflated amount of service: the service it
+ * would have received during its slot had its requests been dispatched
+ * at the estimated peak rate.
+ *
+ * Time fairness can distort the bandwidth distribution noticeably on
+ * devices with internal queueing: requests dispatched during the
+ * service slot of a queue may be served after the slot is over, and
+ * their total processing time may be only loosely correlated with the
+ * slot duration. This is especially true for short service slots.
+ */
+void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  unsigned long time_ms)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
+	int charge = entity->service;
+
+	/* Scale the max budget by the fraction of the timeout used. */
+	if (time_ms > 0 && time_ms < timeout_ms)
+		charge = (bfqd->bfq_max_budget * time_ms) / timeout_ms;
+
+	/* Never charge less than the service actually received. */
+	if (entity->service > charge)
+		charge = entity->service;
+
+	/* Increase budget to avoid inconsistencies */
+	if (entity->budget < charge)
+		entity->budget = charge;
+
+	bfq_bfqq_served(bfqq, max_t(int, 0, charge - entity->service));
+}
+
+/*
+ * Apply any pending weight/priority change to @entity, recompute its
+ * finish time from its budget, possibly push up backshifted timestamps
+ * (see below), and insert @entity into the active tree of the service
+ * tree returned by the weight/priority update.
+ */
+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity,
+					struct bfq_service_tree *st,
+					bool backshifted)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	unsigned long shift;
+
+	st = __bfq_entity_update_weight_prio(st, entity);
+	bfq_calc_finish(entity, entity->budget);
+
+	/*
+	 * Queues that enjoy backshifting for a while may see their
+	 * (virtual) finish timestamps fall further and further behind
+	 * the system virtual time. This happens in particular when
+	 * these queues are often idle for short periods while other
+	 * queues with higher timestamps are busy: to serve the latter,
+	 * the system virtual time may be pushed up well beyond the
+	 * finish timestamps of the idle queues. New or newly activated
+	 * queues then get finish timestamps much larger than those of
+	 * the lucky backshifted queues, which may monopolize the device
+	 * for a long time and simply break service guarantees.
+	 *
+	 * To limit the problem, the backshifted timestamps of the queue
+	 * associated with this entity (only a queue can have the
+	 * backshifted flag set) are pushed up just enough to make the
+	 * finish timestamp equal to the current system virtual time.
+	 * This may introduce a little unfairness among queues with
+	 * backshifted timestamps, but it does not break worst-case
+	 * fairness guarantees.
+	 *
+	 * For a weight-raised queue the push-up is much smaller, to
+	 * keep very low the probability that it makes the backshifted
+	 * finish timestamps of weight-raised queues higher than those
+	 * of non weight-raised queues.
+	 */
+	if (!backshifted || !bfq_gt(st->vtime, entity->finish)) {
+		bfq_active_insert(st, entity);
+		return;
+	}
+
+	shift = st->vtime - entity->finish;
+	if (bfqq)
+		shift /= bfqq->wr_coeff;
+
+	entity->start += shift;
+	entity->finish += shift;
+
+	bfq_active_insert(st, entity);
+}
+
+/**
+ * __bfq_activate_entity - handle activation of entity.
+ * @entity: the entity being activated.
+ * @non_blocking_wait_rq: true if entity was waiting for a request
+ *
+ * Called for a 'true' activation, i.e., if entity is not active and
+ * one of its children receives a new request.
+ *
+ * Basically, this function updates the timestamps of entity and
+ * inserts entity into its active tree, after possibly extracting it
+ * from its idle tree.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity,
+				  bool non_blocking_wait_rq)
+{
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	/* See comments on bfq_bfqq_update_budg_for_activation */
+	bool backshifted = non_blocking_wait_rq &&
+		bfq_gt(st->vtime, entity->finish);
+	unsigned long long min_vstart =
+		backshifted ? entity->finish : st->vtime;
+
+	if (entity->tree != &st->idle) {
+		/*
+		 * The finish time of the entity may be invalid, and
+		 * it is in the past for sure, otherwise the queue
+		 * would have been on the idle tree.
+		 */
+		entity->start = min_vstart;
+		st->wsum += entity->weight;
+		/*
+		 * The entity is about to be inserted into a service
+		 * tree and then set in service: take a reference so
+		 * that it cannot disappear until it is no longer in
+		 * service or scheduled for service.
+		 */
+		bfq_get_entity(entity);
+
+		entity->on_st = true;
+	} else {
+		/*
+		 * The entity is on the idle tree: take it off, and
+		 * start it no earlier than its previous finish time.
+		 */
+		bfq_idle_extract(st, entity);
+		entity->start = bfq_gt(min_vstart, entity->finish) ?
+			min_vstart : entity->finish;
+	}
+
+	bfq_update_fin_time_enqueue(entity, st, backshifted);
+}
+
+/**
+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity.
+ * @entity: the entity being requeued or repositioned.
+ *
+ * Requeueing is needed if this entity stops being served, which
+ * happens if a leaf descendant entity has expired. Repositioning is
+ * needed, instead, if the next_in_service entity for the child entity
+ * has changed.
+ *
+ * Basically, this function: 1) removes entity from its active tree if
+ * present there, 2) updates the timestamps of entity and 3) inserts
+ * entity back into its active tree (in the new, right position for
+ * the new values of the timestamps).
+ */
+static void __bfq_requeue_entity(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	bool in_service = entity == sd->in_service_entity;
+
+	if (in_service) {
+		/*
+		 * The current in-service entity is requeued either
+		 * because it represents the in-service queue and that
+		 * queue is being requeued after an expiration, or
+		 * because it represents a group whose budget changed
+		 * after one of its children was activated or requeued.
+		 *
+		 * Before requeueing, move the start time forward to
+		 * account for the service received while in service.
+		 * The finish time is recomputed later from this new
+		 * start time and from the budget of the entity.
+		 */
+		bfq_calc_finish(entity, entity->service);
+		entity->start = entity->finish;
+	}
+
+	/*
+	 * Dequeue the entity so that it can be re-inserted in the right
+	 * position for its new timestamps. An entity that is active and
+	 * not in service is always extracted: this function is invoked
+	 * on it only if the next_in_service entity below it has changed,
+	 * which changes its budget and hence its finish time. An
+	 * in-service entity is extracted only if it is still on a tree,
+	 * which is the case if it had more than one child when it was
+	 * set in service.
+	 */
+	if (!in_service || entity->tree)
+		bfq_active_extract(st, entity);
+
+	bfq_update_fin_time_enqueue(entity, st, false);
+}
+
+/*
+ * Dispatch between a true activation and a requeue/reposition of
+ * @entity, depending on whether @entity is currently in service in @sd
+ * or queued on the active tree of its service tree.
+ */
+static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
+					  struct bfq_sched_data *sd,
+					  bool non_blocking_wait_rq)
+{
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+
+	if (sd->in_service_entity != entity && entity->tree != &st->active) {
+		/*
+		 * Not in service and not queued on its active tree:
+		 * the entity is idle and this is a true activation.
+		 */
+		__bfq_activate_entity(entity, non_blocking_wait_rq);
+		return;
+	}
+
+	/*
+	 * In service or already queued on the active tree: requeue or
+	 * reposition.
+	 */
+	__bfq_requeue_entity(entity);
+}
+
+
+/**
+ * bfq_activate_requeue_entity - activate or requeue an entity
+ *				 representing a bfq_queue, and activate,
+ *				 requeue or reposition all ancestors for
+ *				 which such an update becomes necessary.
+ * @entity: the entity to activate.
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ * @requeue: true if this is a requeue, which implies that bfqq is
+ *	     being expired; thus ALL its ancestors stop being served and
+ *	     must therefore be requeued
+ */
+static void bfq_activate_requeue_entity(struct bfq_entity *entity,
+					bool non_blocking_wait_rq,
+					bool requeue)
+{
+	for_each_entity(entity) {
+		struct bfq_sched_data *sd = entity->sched_data;
+		bool next_changed;
+
+		__bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
+
+		/* Stop climbing once nothing changes further up. */
+		next_changed = bfq_update_next_in_service(sd, entity);
+		if (!(next_changed || requeue))
+			break;
+	}
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: if false, the entity will not be put into the
+ *			idle tree.
+ *
+ * Deactivates an entity, independently from its previous state. Must
+ * be invoked only if entity is on a service tree. Extracts the entity
+ * from that tree, and if necessary and allowed, puts it on the idle
+ * tree.
+ *
+ * Return: false if the entity was not on a service tree (never
+ * activated, or already inactive) and nothing was done, true otherwise.
+ */
+bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	bool in_service = sd->in_service_entity == entity;
+	bool goes_idle;
+
+	if (!entity->on_st)
+		return false;
+
+	/* Account for the service received while in service. */
+	if (in_service)
+		bfq_calc_finish(entity, entity->service);
+
+	if (entity->tree == &st->active)
+		bfq_active_extract(st, entity);
+	else if (!in_service && entity->tree == &st->idle)
+		bfq_idle_extract(st, entity);
+
+	/* Only an entity whose finish time is still ahead goes idle. */
+	goes_idle = ins_into_idle_tree && bfq_gt(entity->finish, st->vtime);
+	if (goes_idle)
+		bfq_idle_insert(st, entity);
+	else
+		bfq_forget_entity(st, entity, in_service);
+
+	return true;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue.
+ * @entity: the entity to deactivate.
+ * @ins_into_idle_tree: true if the entity can be put on the idle tree
+ * @expiration: when true, the requeue loop at the end of this function
+ * keeps walking upwards even if bfq_update_next_in_service() reports no
+ * change (presumably set when the deactivation is caused by the
+ * expiration of the queue - confirm against callers).
+ *
+ * The function works in two phases. First, @entity and then its
+ * ancestors are deactivated one after the other, until an ancestor
+ * whose sched_data still has a next_in_service entity is found.
+ * Second, the entities from that point upwards are requeued or
+ * repositioned through __bfq_requeue_entity().
+ */
+static void bfq_deactivate_entity(struct bfq_entity *entity,
+ bool ins_into_idle_tree,
+ bool expiration)
+{
+ struct bfq_sched_data *sd;
+ struct bfq_entity *parent = NULL;
+
+ for_each_entity_safe(entity, parent) {
+ sd = entity->sched_data;
+
+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) {
+ /*
+ * entity is not in any tree any more, so
+ * this deactivation is a no-op, and there is
+ * nothing to change for upper-level entities
+ * (in case of expiration, this can never
+ * happen).
+ */
+ return;
+ }
+
+ if (sd->next_in_service == entity)
+ /*
+ * entity was the next_in_service entity,
+ * then, since entity has just been
+ * deactivated, a new one must be found.
+ */
+ bfq_update_next_in_service(sd, NULL);
+
+ if (sd->next_in_service)
+ /*
+ * The parent entity is still backlogged,
+ * because next_in_service is not NULL. So, no
+ * further upwards deactivation must be
+ * performed. Yet, next_in_service has
+ * changed. Then the schedule does need to be
+ * updated upwards.
+ */
+ break;
+
+ /*
+ * If we get here, then the parent is no more
+ * backlogged and we need to propagate the
+ * deactivation upwards. Thus let the loop go on.
+ */
+
+ /*
+ * Also let parent be queued into the idle tree on
+ * deactivation, to preserve service guarantees, and
+ * assuming that who invoked this function does not
+ * need parent entities too to be removed completely.
+ */
+ ins_into_idle_tree = true;
+ }
+
+ /*
+ * If the deactivation loop is fully executed, then there are
+ * no more entities to touch and next loop is not executed at
+ * all. Otherwise, requeue remaining entities if they are
+ * about to stop receiving service, or reposition them if this
+ * is not the case.
+ */
+ entity = parent;
+ for_each_entity(entity) {
+ /*
+ * Invoke __bfq_requeue_entity on entity, even if
+ * already active, to requeue/reposition it in the
+ * active tree (because sd->next_in_service has
+ * changed)
+ */
+ __bfq_requeue_entity(entity);
+
+ sd = entity->sched_data;
+ if (!bfq_update_next_in_service(sd, entity) &&
+ !expiration)
+ /*
+ * next_in_service unchanged or not causing
+ * any change in entity->parent->sd, and no
+ * requeueing needed for expiration: stop
+ * here.
+ */
+ break;
+ }
+}
+
+/**
+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump,
+ *                       if needed, to have at least one entity eligible.
+ * @st: the service tree to act upon.
+ *
+ * Assumes that st is not empty.
+ *
+ * Return: the min_start of the root of the active tree if that is
+ * ahead of the current virtual time, the current virtual time otherwise.
+ */
+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st)
+{
+	struct bfq_entity *top = bfq_root_active_entity(&st->active);
+
+	return bfq_gt(top->min_start, st->vtime) ? top->min_start : st->vtime;
+}
+
+/*
+ * Move the virtual time of @st forward to @new_value, and trim the idle
+ * tree accordingly. Nothing is done if @new_value is not ahead of the
+ * current virtual time.
+ *
+ * The comparison goes through bfq_gt(), like every other timestamp
+ * comparison in this file and in particular like the one made by
+ * bfq_calc_vtime_jump() to produce the value passed in here, so that
+ * the two decisions always agree.
+ */
+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value)
+{
+	if (!bfq_gt(new_value, st->vtime))
+		return;
+
+	st->vtime = new_value;
+	bfq_forget_idle(st);
+}
+
+/**
+ * bfq_first_active_entity - find the eligible entity with
+ *                           the smallest finish time
+ * @st: the service tree to select from.
+ * @vtime: the system virtual to use as a reference for eligibility
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start <= vtime) entity. The path on
+ * the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ *
+ * Return: the entity found, or %NULL if no entity is eligible.
+ */
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st,
+						  u64 vtime)
+{
+	struct bfq_entity *best = NULL;
+	struct rb_node *node = st->active.rb_node;
+
+	while (node) {
+		struct bfq_entity *cur =
+			rb_entry(node, struct bfq_entity, rb_node);
+		struct rb_node *left = node->rb_left;
+
+		if (!bfq_gt(cur->start, vtime))
+			best = cur;
+
+		if (left) {
+			struct bfq_entity *lchild =
+				rb_entry(left, struct bfq_entity, rb_node);
+
+			/* Something eligible on the left: go there. */
+			if (!bfq_gt(lchild->min_start, vtime)) {
+				node = left;
+				continue;
+			}
+		}
+
+		if (best)
+			break;
+
+		node = node->rb_right;
+	}
+
+	return best;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ * @in_service: true if the sched_data @st belongs to currently has an
+ *		in-service entity.
+ *
+ * If there is no in-service entity for the sched_data st belongs to,
+ * then return the entity that will be set in service if:
+ * 1) the parent entity this st belongs to is set in service;
+ * 2) no entity belonging to such parent entity undergoes a state change
+ * that would influence the timestamps of the entity (e.g., becomes idle,
+ * becomes backlogged, changes its budget, ...).
+ *
+ * In this first case, update the virtual time in @st too (see the
+ * comments on this update inside the function).
+ *
+ * In contrast, if there is an in-service entity, then return the
+ * entity that would be set in service if not only the above
+ * conditions, but also the next one held true: the currently
+ * in-service entity, on expiration,
+ * 1) gets a finish time equal to the current one, or
+ * 2) is not eligible any more, or
+ * 3) is idle.
+ *
+ * Return: the entity found, or %NULL if the active tree of @st is empty.
+ */
+static struct bfq_entity *
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service)
+{
+	u64 eligible_vtime;
+
+	if (RB_EMPTY_ROOT(&st->active))
+		return NULL;
+
+	/*
+	 * Value of the system virtual time for which at least one
+	 * entity is eligible.
+	 */
+	eligible_vtime = bfq_calc_vtime_jump(st);
+
+	/*
+	 * Without an in-service entity, push the system virtual time up
+	 * to the value that guarantees that at least one entity is
+	 * eligible. With an in-service entity no update is made,
+	 * because an eligible entity already exists, namely the
+	 * in-service one (even if it is not on st, because it was
+	 * extracted when set in service).
+	 */
+	if (!in_service)
+		bfq_update_vtime(st, eligible_vtime);
+
+	return bfq_first_active_entity(st, eligible_vtime);
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ *
+ * This function is invoked when there has been a change in the trees
+ * for sd, and we need to know what is the new next entity after this
+ * change.
+ *
+ * Return: the entity found, or %NULL if no class has an eligible entity.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd)
+{
+	struct bfq_service_tree *trees = sd->service_tree;
+	struct bfq_service_tree *idle_class_st =
+		&trees[BFQ_IOPRIO_CLASSES - 1];
+	struct bfq_entity *found = NULL;
+	int idx = 0;
+
+	/*
+	 * Choose from idle class, if needed to guarantee a minimum
+	 * bandwidth to this class (and if there is some active entity
+	 * in idle class). This should also mitigate
+	 * priority-inversion problems in case a low priority task is
+	 * holding file system resources.
+	 */
+	if (time_is_before_jiffies(sd->bfq_class_idle_last_service +
+				   BFQ_CL_IDLE_TIMEOUT)) {
+		if (!RB_EMPTY_ROOT(&idle_class_st->active))
+			idx = BFQ_IOPRIO_CLASSES - 1;
+		/* About to be served if backlogged, or not yet backlogged */
+		sd->bfq_class_idle_last_service = jiffies;
+	}
+
+	/*
+	 * Scan the classes from the highest-priority one (or from the
+	 * idle class, if it needs to be served) and stop at the first
+	 * class that yields an entity.
+	 */
+	while (idx < BFQ_IOPRIO_CLASSES && !found) {
+		found = __bfq_lookup_next_entity(&trees[idx],
+						 sd->in_service_entity);
+		idx++;
+	}
+
+	return found;
+}
+
+/*
+ * Return true if, in the sched_data of the root group of @bfqd, the
+ * cached next-in-service entity differs from the entity currently in
+ * service.
+ */
+bool next_queue_may_preempt(struct bfq_data *bfqd)
+{
+	struct bfq_sched_data *root_sd = &bfqd->root_group->sched_data;
+	struct bfq_entity *next = root_sd->next_in_service;
+	struct bfq_entity *cur = root_sd->in_service_entity;
+
+	return next != cur;
+}
+
+/*
+ * Get next queue for service.
+ *
+ * Returns NULL if bfqd has no busy queues. Otherwise walks down from the
+ * root group's sched_data: at each level sd->next_in_service becomes
+ * sd->in_service_entity and its service counter is zeroed, until an
+ * entity whose my_sched_data is NULL (the leaf) is reached. The queue
+ * obtained from that leaf through bfq_entity_to_bfqq() is returned, after
+ * next_in_service has been refreshed from the leaf back towards the root.
+ */
+struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+ struct bfq_entity *entity = NULL;
+ struct bfq_sched_data *sd;
+ struct bfq_queue *bfqq;
+
+ if (bfqd->busy_queues == 0)
+ return NULL;
+
+ /*
+ * Traverse the path from the root to the leaf entity to
+ * serve. Set in service all the entities visited along the
+ * way.
+ */
+ sd = &bfqd->root_group->sched_data;
+ for (; sd ; sd = entity->my_sched_data) {
+ /*
+ * WARNING. We are about to set the in-service entity
+ * to sd->next_in_service, i.e., to the (cached) value
+ * returned by bfq_lookup_next_entity(sd) the last
+ * time it was invoked, i.e., the last time when the
+ * service order in sd changed as a consequence of the
+ * activation or deactivation of an entity. In this
+ * respect, if we execute bfq_lookup_next_entity(sd)
+ * in this very moment, it may, although with low
+ * probability, yield a different entity than that
+ * pointed to by sd->next_in_service. This rare event
+ * happens in case there was no CLASS_IDLE entity to
+ * serve for sd when bfq_lookup_next_entity(sd) was
+ * invoked for the last time, while there is now one
+ * such entity.
+ *
+ * If the above event happens, then the scheduling of
+ * such entity in CLASS_IDLE is postponed until the
+ * service of the sd->next_in_service entity
+ * finishes. In fact, when the latter is expired,
+ * bfq_lookup_next_entity(sd) gets called again,
+ * exactly to update sd->next_in_service.
+ */
+
+ /* Make next_in_service entity become in_service_entity */
+ entity = sd->next_in_service;
+ sd->in_service_entity = entity;
+
+ /*
+ * Reset the accumulator of the amount of service that
+ * the entity is about to receive.
+ */
+ entity->service = 0;
+
+ /*
+ * If entity is no longer a candidate for next
+ * service, then we extract it from its active tree,
+ * for the following reason. To further boost the
+ * throughput in some special case, BFQ needs to know
+ * which is the next candidate entity to serve, while
+ * there is already an entity in service. In this
+ * respect, to make it easy to compute/update the next
+ * candidate entity to serve after the current
+ * candidate has been set in service, there is a case
+ * where it is necessary to extract the current
+ * candidate from its service tree. Such a case is
+ * when the entity just set in service cannot be also
+ * a candidate for next service. Details about when
+ * this condition holds are reported in the comments
+ * on the function bfq_no_longer_next_in_service()
+ * invoked below.
+ */
+ if (bfq_no_longer_next_in_service(entity))
+ bfq_active_extract(bfq_entity_service_tree(entity),
+ entity);
+
+ /*
+ * For the same reason why we may have just extracted
+ * entity from its active tree, we may need to update
+ * next_in_service for the sched_data of entity too,
+ * regardless of whether entity has been extracted.
+ * In fact, even if entity has not been extracted, a
+ * descendant entity may get extracted. Such an event
+ * would cause a change in next_in_service for the
+ * level of the descendant entity, and thus possibly
+ * back to upper levels.
+ *
+ * We cannot perform the resulting needed update
+ * before the end of this loop, because, to know which
+ * is the correct next-to-serve candidate entity for
+ * each level, we need first to find the leaf entity
+ * to set in service. In fact, only after we know
+ * which is the next-to-serve leaf entity, we can
+ * discover whether the parent entity of the leaf
+ * entity becomes the next-to-serve, and so on.
+ */
+
+ }
+
+ /* entity is now the leaf reached by the walk above. */
+ bfqq = bfq_entity_to_bfqq(entity);
+
+ /*
+ * We can finally update all next-to-serve entities along the
+ * path from the leaf entity just set in service to the root.
+ * The walk stops early as soon as bfq_update_next_in_service()
+ * returns false for a level.
+ */
+ for_each_entity(entity) {
+ /* NOTE(review): this declaration shadows the function-level sd. */
+ struct bfq_sched_data *sd = entity->sched_data;
+
+ if (!bfq_update_next_in_service(sd, NULL))
+ break;
+ }
+
+ return bfqq;
+}
+
+/*
+ * Drop the in-service state of @bfqd: clear the wait-request flag of the
+ * in-service queue, try to cancel the idle slice timer, forget the
+ * in-service queue and reset in_service_entity on every level from the
+ * queue's entity upwards.
+ */
+void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+	struct bfq_queue *served = bfqd->in_service_queue;
+	struct bfq_entity *leaf = &served->entity;
+	struct bfq_entity *pos = leaf;
+
+	bfq_clear_bfqq_wait_request(served);
+	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+	bfqd->in_service_queue = NULL;
+
+	/*
+	 * When this function is called, all in-service entities have
+	 * been properly deactivated or requeued, so the last step is
+	 * safe: reset in_service_entity along the path from the leaf
+	 * to the root.
+	 */
+	for_each_entity(pos) {
+		struct bfq_sched_data *level = pos->sched_data;
+
+		level->in_service_entity = NULL;
+	}
+
+	/*
+	 * The leaf is no longer in service. If it still sits in some
+	 * service tree there is nothing more to do; otherwise release
+	 * the service reference to the queue it represents (taken with
+	 * bfq_get_entity).
+	 */
+	if (leaf->on_st)
+		return;
+
+	bfq_put_queue(served);
+}
+
+/*
+ * Deactivate the scheduling entity embedded in @bfqq, forwarding
+ * @ins_into_idle_tree and @expiration to bfq_deactivate_entity().
+ * @bfqd is not used by this wrapper.
+ */
+void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			 bool ins_into_idle_tree, bool expiration)
+{
+	bfq_deactivate_entity(&bfqq->entity, ins_into_idle_tree, expiration);
+}
+
+/*
+ * Activate the scheduling entity embedded in @bfqq. The current value of
+ * the queue's non_blocking_wait_rq flag is handed to
+ * bfq_activate_requeue_entity(), after which the flag is cleared.
+ */
+void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	bool wait_rq = bfq_bfqq_non_blocking_wait_rq(bfqq);
+
+	bfq_activate_requeue_entity(&bfqq->entity, wait_rq, false);
+	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+/*
+ * Requeue the scheduling entity embedded in @bfqq. The last argument
+ * given to bfq_activate_requeue_entity() is true only when @bfqq is the
+ * queue currently in service on @bfqd.
+ */
+void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	bool in_service = (bfqq == bfqd->in_service_queue);
+
+	bfq_activate_requeue_entity(&bfqq->entity, false, in_service);
+}
+
+/*
+ * Called when the bfqq no longer has requests pending: mark it not busy,
+ * update the busy counters and remove it from the service tree. As a
+ * special case it can be invoked during an expiration; @expiration is
+ * forwarded to bfq_deactivate_bfqq().
+ */
+void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		       bool expiration)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+	bfq_clear_bfqq_busy(bfqq);
+	--bfqd->busy_queues;
+
+	/* Queues with no dispatched requests leave the weights tree here. */
+	if (bfqq->dispatched == 0)
+		bfq_weights_tree_remove(bfqd, entity,
+					&bfqd->queue_weights_tree);
+
+	if (bfqq->wr_coeff > 1)
+		--bfqd->wr_busy_queues;
+
+	bfqg_stats_update_dequeue(bfqq_group(bfqq));
+
+	bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
+}
+
+/*
+ * Called when an inactive queue receives a new request: activate the
+ * queue, mark it busy and update the busy counters.
+ */
+void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+	bfq_activate_bfqq(bfqd, bfqq);
+
+	bfq_mark_bfqq_busy(bfqq);
+	++bfqd->busy_queues;
+
+	/*
+	 * Only a queue with no dispatched requests and a weight-raising
+	 * coefficient of exactly 1 is added to the weights tree.
+	 */
+	if (bfqq->dispatched == 0 && bfqq->wr_coeff == 1)
+		bfq_weights_tree_add(bfqd, &bfqq->entity,
+				     &bfqd->queue_weights_tree);
+
+	if (bfqq->wr_coeff > 1)
+		++bfqd->wr_busy_queues;
+}
diff --git a/block/bio.c b/block/bio.c
index e75878f8b14af..f4d2071802663 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -30,6 +30,7 @@
#include <linux/cgroup.h>
#include <trace/events/block.h>
+#include "blk.h"
/*
* Test patch to inline a certain number of bi_io_vec's inside the bio
@@ -427,7 +428,8 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
* RETURNS:
* Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
+ struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
unsigned front_pad;
@@ -1824,6 +1826,11 @@ static inline bool bio_remaining_done(struct bio *bio)
* bio_endio() will end I/O on the whole bio. bio_endio() is the preferred
* way to end I/O on a bio. No one should call bi_end_io() directly on a
* bio unless they own it and thus know that it has an end_io function.
+ *
+ * bio_endio() can be called several times on a bio that has been chained
+ * using bio_chain(). The ->bi_end_io() function will only be called the
+ * last time. At this point the BLK_TA_COMPLETE tracing event will be
+ * generated if BIO_TRACE_COMPLETION is set.
**/
void bio_endio(struct bio *bio)
{
@@ -1844,6 +1851,13 @@ again:
goto again;
}
+ if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_complete(bdev_get_queue(bio->bi_bdev),
+ bio, bio->bi_error);
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+ }
+
+ blk_throtl_bio_endio(bio);
if (bio->bi_end_io)
bio->bi_end_io(bio);
}
@@ -1882,6 +1896,9 @@ struct bio *bio_split(struct bio *bio, int sectors,
bio_advance(bio, split->bi_iter.bi_size);
+ /*
+ * The split bio is a new bio: carry the flag over from the parent so
+ * that its completion gets traced as well.
+ */
+ if (bio_flagged(bio, BIO_TRACE_COMPLETION))
+ bio_set_flag(split, BIO_TRACE_COMPLETION);
+
return split;
}
EXPORT_SYMBOL(bio_split);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d7..7c2947128f581 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
+/*
+ * Look up the blkg for @blkcg on @q after validating the queue state.
+ *
+ * Returns ERR_PTR(-EOPNOTSUPP) if @pol is not enabled on @q, and
+ * ERR_PTR(-ENODEV) or ERR_PTR(-EBUSY) if @q is bypassing (dying or not,
+ * respectively). Otherwise returns whatever __blkg_lookup() yields, with
+ * the lookup hint updated.
+ */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+					  const struct blkcg_policy *pol,
+					  struct request_queue *q)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert_held(q->queue_lock);
+
+	if (!blkcg_policy_enabled(q, pol))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q))) {
+		if (blk_queue_dying(q))
+			return ERR_PTR(-ENODEV);
+		return ERR_PTR(-EBUSY);
+	}
+
+	return __blkg_lookup(blkcg, q, true /* update_hint */);
+}
+
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
* @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
__acquires(rcu) __acquires(disk->queue->queue_lock)
{
struct gendisk *disk;
+ struct request_queue *q;
struct blkcg_gq *blkg;
struct module *owner;
unsigned int major, minor;
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (!disk)
return -ENODEV;
if (part) {
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
- return -ENODEV;
+ ret = -ENODEV;
+ goto fail;
}
- rcu_read_lock();
- spin_lock_irq(disk->queue->queue_lock);
+ q = disk->queue;
- if (blkcg_policy_enabled(disk->queue, pol))
- blkg = blkg_lookup_create(blkcg, disk->queue);
- else
- blkg = ERR_PTR(-EOPNOTSUPP);
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+ blkg = blkg_lookup_check(blkcg, pol, q);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+
+ if (blkg)
+ goto success;
+
+ /*
+ * Create blkgs walking down from blkcg_root to @blkcg, so that all
+ * non-root blkgs have access to their parents.
+ */
+ while (true) {
+ struct blkcg *pos = blkcg;
+ struct blkcg *parent;
+ struct blkcg_gq *new_blkg;
+
+ parent = blkcg_parent(blkcg);
+ while (parent && !__blkg_lookup(parent, q, false)) {
+ pos = parent;
+ parent = blkcg_parent(parent);
+ }
+
+ /* Drop locks to do new blkg allocation with GFP_KERNEL. */
+ spin_unlock_irq(q->queue_lock);
rcu_read_unlock();
- spin_unlock_irq(disk->queue->queue_lock);
- owner = disk->fops->owner;
- put_disk(disk);
- module_put(owner);
- /*
- * If queue was bypassing, we should retry. Do so after a
- * short msleep(). It isn't strictly necessary but queue
- * can be bypassing for some time and it's always nice to
- * avoid busy looping.
- */
- if (ret == -EBUSY) {
- msleep(10);
- ret = restart_syscall();
+
+ new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+ if (unlikely(!new_blkg)) {
+ ret = -ENOMEM;
+ goto fail;
}
- return ret;
- }
+ rcu_read_lock();
+ spin_lock_irq(q->queue_lock);
+
+ blkg = blkg_lookup_check(pos, pol, q);
+ if (IS_ERR(blkg)) {
+ ret = PTR_ERR(blkg);
+ /* new_blkg was never handed to blkg_create(); free it here. */
+ blkg_free(new_blkg);
+ goto fail_unlock;
+ }
+
+ if (blkg) {
+ blkg_free(new_blkg);
+ } else {
+ blkg = blkg_create(pos, q, new_blkg);
+ if (unlikely(IS_ERR(blkg))) {
+ ret = PTR_ERR(blkg);
+ goto fail_unlock;
+ }
+ }
+
+ if (pos == blkcg)
+ goto success;
+ }
+success:
ctx->disk = disk;
ctx->blkg = blkg;
ctx->body = body;
return 0;
+
+fail_unlock:
+ spin_unlock_irq(q->queue_lock);
+ rcu_read_unlock();
+fail:
+ owner = disk->fops->owner;
+ put_disk(disk);
+ module_put(owner);
+ /*
+ * If queue was bypassing, we should retry. Do so after a
+ * short msleep(). It isn't strictly necessary but queue
+ * can be bypassing for some time and it's always nice to
+ * avoid busy looping.
+ */
+ if (ret == -EBUSY) {
+ msleep(10);
+ ret = restart_syscall();
+ }
+ return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);
diff --git a/block/blk-core.c b/block/blk-core.c
index d772c221cc178..24886b69690f0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -268,10 +268,8 @@ void blk_sync_queue(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
- queue_for_each_hw_ctx(q, hctx, i) {
- cancel_work_sync(&hctx->run_work);
- cancel_delayed_work_sync(&hctx->delay_work);
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ cancel_delayed_work_sync(&hctx->run_work);
} else {
cancel_delayed_work_sync(&q->delay_work);
}
@@ -500,6 +498,13 @@ void blk_set_queue_dying(struct request_queue *q)
queue_flag_set(QUEUE_FLAG_DYING, q);
spin_unlock_irq(q->queue_lock);
+ /*
+ * When queue DYING flag is set, we need to block new req
+ * entering queue, so we call blk_freeze_queue_start() to
+ * prevent I/O from crossing blk_queue_enter().
+ */
+ blk_freeze_queue_start(q);
+
if (q->mq_ops)
blk_mq_wake_waiters(q);
else {
@@ -556,9 +561,13 @@ void blk_cleanup_queue(struct request_queue *q)
* prevent that q->request_fn() gets invoked after draining finished.
*/
blk_freeze_queue(q);
- spin_lock_irq(lock);
- if (!q->mq_ops)
+ if (!q->mq_ops) {
+ spin_lock_irq(lock);
__blk_drain_queue(q, true);
+ } else {
+ blk_mq_debugfs_unregister_mq(q);
+ spin_lock_irq(lock);
+ }
queue_flag_set(QUEUE_FLAG_DEAD, q);
spin_unlock_irq(lock);
@@ -669,6 +678,15 @@ int blk_queue_enter(struct request_queue *q, bool nowait)
if (nowait)
return -EBUSY;
+ /*
+ * Read side of the barrier pair in blk_freeze_queue_start():
+ * we need to order reading __PERCPU_REF_DEAD flag of
+ * .q_usage_counter and reading .mq_freeze_depth or
+ * queue dying flag, otherwise the following wait may
+ * never return if the two reads are reordered.
+ */
+ smp_rmb();
+
ret = wait_event_interruptible(q->mq_freeze_wq,
!atomic_read(&q->mq_freeze_depth) ||
blk_queue_dying(q));
@@ -720,6 +738,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q->backing_dev_info)
goto fail_split;
+ q->stats = blk_alloc_queue_stats();
+ if (!q->stats)
+ goto fail_stats;
+
q->backing_dev_info->ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
@@ -776,6 +798,8 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
fail_ref:
percpu_ref_exit(&q->q_usage_counter);
fail_bdi:
+ blk_free_queue_stats(q->stats);
+fail_stats:
bdi_put(q->backing_dev_info);
fail_split:
bioset_free(q->bio_split);
@@ -889,7 +913,6 @@ out_exit_flush_rq:
q->exit_rq_fn(q, q->fq->flush_rq);
out_free_flush_queue:
blk_free_flush_queue(q->fq);
- wbt_exit(q);
return -ENOMEM;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1128,7 +1151,6 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
blk_rq_init(q, rq);
blk_rq_set_rl(rq, rl);
- blk_rq_set_prio(rq, ioc);
rq->cmd_flags = op;
rq->rq_flags = rq_flags;
@@ -1608,17 +1630,23 @@ out:
return ret;
}
-void init_request_from_bio(struct request *req, struct bio *bio)
+void blk_init_request_from_bio(struct request *req, struct bio *bio)
{
+ struct io_context *ioc = rq_ioc(bio);
+
if (bio->bi_opf & REQ_RAHEAD)
req->cmd_flags |= REQ_FAILFAST_MASK;
- req->errors = 0;
req->__sector = bio->bi_iter.bi_sector;
if (ioprio_valid(bio_prio(bio)))
req->ioprio = bio_prio(bio);
+ else if (ioc)
+ req->ioprio = ioc->ioprio;
+ else
+ req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
blk_rq_bio_prep(req->q, req, bio);
}
+EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
{
@@ -1709,7 +1737,7 @@ get_rq:
* We don't worry about that case for efficiency. It won't happen
* often, and the elevators are able to handle it.
*/
- init_request_from_bio(req, bio);
+ blk_init_request_from_bio(req, bio);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
req->cpu = raw_smp_processor_id();
@@ -1936,7 +1964,13 @@ generic_make_request_checks(struct bio *bio)
if (!blkcg_bio_issue_check(q, bio))
return false;
- trace_block_bio_queue(q, bio);
+ if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_queue(q, bio);
+ /* Now that enqueuing has been traced, we need to trace
+ * completion as well.
+ */
+ bio_set_flag(bio, BIO_TRACE_COMPLETION);
+ }
return true;
not_supported:
@@ -2478,7 +2512,7 @@ void blk_start_request(struct request *req)
blk_dequeue_request(req);
if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
- blk_stat_set_issue_time(&req->issue_stat);
+ blk_stat_set_issue(&req->issue_stat, blk_rq_sectors(req));
req->rq_flags |= RQF_STATS;
wbt_issue(req->q->rq_wb, &req->issue_stat);
}
@@ -2540,22 +2574,11 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
{
int total_bytes;
- trace_block_rq_complete(req->q, req, nr_bytes);
+ trace_block_rq_complete(req, error, nr_bytes);
if (!req->bio)
return false;
- /*
- * For fs requests, rq is just carrier of independent bio's
- * and each partial completion should be handled separately.
- * Reset per-request error on each partial completion.
- *
- * TODO: tj: This is too subtle. It would be better to let
- * low level drivers do what they see fit.
- */
- if (!blk_rq_is_passthrough(req))
- req->errors = 0;
-
if (error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)) {
char *error_type;
@@ -2601,6 +2624,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
if (bio_bytes == bio->bi_iter.bi_size)
req->bio = bio->bi_next;
+ /* Completion has already been traced */
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
req_bio_endio(req, bio, bio_bytes, error);
total_bytes += bio_bytes;
@@ -2699,7 +2724,7 @@ void blk_finish_request(struct request *req, int error)
struct request_queue *q = req->q;
if (req->rq_flags & RQF_STATS)
- blk_stat_add(&q->rq_stats[rq_data_dir(req)], req);
+ blk_stat_add(req);
if (req->rq_flags & RQF_QUEUED)
blk_queue_end_tag(q, req);
@@ -2776,7 +2801,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
* %false - we are done with this request
* %true - still buffers pending for this request
**/
-bool __blk_end_bidi_request(struct request *rq, int error,
+static bool __blk_end_bidi_request(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes)
{
if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
@@ -2829,43 +2854,6 @@ void blk_end_request_all(struct request *rq, int error)
EXPORT_SYMBOL(blk_end_request_all);
/**
- * blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @error: %0 for success, < %0 for error
- *
- * Description:
- * Complete the current consecutively mapped chunk from @rq.
- *
- * Return:
- * %false - we are done with this request
- * %true - still buffers pending for this request
- */
-bool blk_end_request_cur(struct request *rq, int error)
-{
- return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
-EXPORT_SYMBOL(blk_end_request_cur);
-
-/**
- * blk_end_request_err - Finish a request till the next failure boundary.
- * @rq: the request to finish till the next failure boundary for
- * @error: must be negative errno
- *
- * Description:
- * Complete @rq till the next failure boundary.
- *
- * Return:
- * %false - we are done with this request
- * %true - still buffers pending for this request
- */
-bool blk_end_request_err(struct request *rq, int error)
-{
- WARN_ON(error >= 0);
- return blk_end_request(rq, error, blk_rq_err_bytes(rq));
-}
-EXPORT_SYMBOL_GPL(blk_end_request_err);
-
-/**
* __blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed
* @error: %0 for success, < %0 for error
@@ -2924,26 +2912,6 @@ bool __blk_end_request_cur(struct request *rq, int error)
}
EXPORT_SYMBOL(__blk_end_request_cur);
-/**
- * __blk_end_request_err - Finish a request till the next failure boundary.
- * @rq: the request to finish till the next failure boundary for
- * @error: must be negative errno
- *
- * Description:
- * Complete @rq till the next failure boundary. Must be called
- * with queue lock held.
- *
- * Return:
- * %false - we are done with this request
- * %true - still buffers pending for this request
- */
-bool __blk_end_request_err(struct request *rq, int error)
-{
- WARN_ON(error >= 0);
- return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
-}
-EXPORT_SYMBOL_GPL(__blk_end_request_err);
-
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio)
{
@@ -3106,6 +3074,13 @@ int kblockd_schedule_work_on(int cpu, struct work_struct *work)
}
EXPORT_SYMBOL(kblockd_schedule_work_on);
+/*
+ * Thin wrapper around mod_delayed_work_on() that always targets
+ * kblockd_workqueue. @cpu, @dwork and @delay are forwarded unchanged and
+ * the callee's return value is passed back to the caller.
+ */
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
+ unsigned long delay)
+{
+ return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
+}
+EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
+
int kblockd_schedule_delayed_work(struct delayed_work *dwork,
unsigned long delay)
{
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 8cd0e9bc8dc89..a9451e3b85871 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -69,8 +69,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
if (unlikely(blk_queue_dying(q))) {
rq->rq_flags |= RQF_QUIET;
- rq->errors = -ENXIO;
- __blk_end_request_all(rq, rq->errors);
+ __blk_end_request_all(rq, -ENXIO);
spin_unlock_irq(q->queue_lock);
return;
}
@@ -92,11 +91,10 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
* Insert a fully prepared request at the back of the I/O scheduler queue
* for execution and wait for completion.
*/
-int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
+void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
struct request *rq, int at_head)
{
DECLARE_COMPLETION_ONSTACK(wait);
- int err = 0;
unsigned long hang_check;
rq->end_io_data = &wait;
@@ -108,10 +106,5 @@ int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
else
wait_for_completion_io(&wait);
-
- if (rq->errors)
- err = -EIO;
-
- return err;
}
EXPORT_SYMBOL(blk_execute_rq);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 0d5a9c1da1fc7..c4e0880b54bbf 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -447,7 +447,7 @@ void blk_insert_flush(struct request *rq)
if (q->mq_ops)
blk_mq_end_request(rq, 0);
else
- __blk_end_bidi_request(rq, 0, 0, 0);
+ __blk_end_request(rq, 0, 0);
return;
}
@@ -497,8 +497,7 @@ void blk_insert_flush(struct request *rq)
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
- * wish to. If WAIT flag is not passed then caller may check only what
- * request was pushed in some internal queue for later handling.
+ * wish to.
*/
int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
sector_t *error_sector)
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 9f0ff5ba4f84d..0f891a9aff4d6 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -389,7 +389,7 @@ static int blk_integrity_nop_fn(struct blk_integrity_iter *iter)
return 0;
}
-static struct blk_integrity_profile nop_profile = {
+static const struct blk_integrity_profile nop_profile = {
.name = "nop",
.generate_fn = blk_integrity_nop_fn,
.verify_fn = blk_integrity_nop_fn,
@@ -412,12 +412,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE |
template->flags;
- bi->interval_exp = ilog2(queue_logical_block_size(disk->queue));
+ bi->interval_exp = template->interval_exp ? :
+ ilog2(queue_logical_block_size(disk->queue));
bi->profile = template->profile ? template->profile : &nop_profile;
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
- blk_integrity_revalidate(disk);
+ disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
}
EXPORT_SYMBOL(blk_integrity_register);
@@ -430,26 +431,11 @@ EXPORT_SYMBOL(blk_integrity_register);
*/
void blk_integrity_unregister(struct gendisk *disk)
{
- blk_integrity_revalidate(disk);
+ disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
}
EXPORT_SYMBOL(blk_integrity_unregister);
-void blk_integrity_revalidate(struct gendisk *disk)
-{
- struct blk_integrity *bi = &disk->queue->integrity;
-
- if (!(disk->flags & GENHD_FL_UP))
- return;
-
- if (bi->profile)
- disk->queue->backing_dev_info->capabilities |=
- BDI_CAP_STABLE_WRITES;
- else
- disk->queue->backing_dev_info->capabilities &=
- ~BDI_CAP_STABLE_WRITES;
-}
-
void blk_integrity_add(struct gendisk *disk)
{
if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
diff --git a/block/blk-lib.c b/block/blk-lib.c
index ed1e78e24db00..e8caecd71688e 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -37,17 +37,12 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
return -ENXIO;
if (flags & BLKDEV_DISCARD_SECURE) {
- if (flags & BLKDEV_DISCARD_ZERO)
- return -EOPNOTSUPP;
if (!blk_queue_secure_erase(q))
return -EOPNOTSUPP;
op = REQ_OP_SECURE_ERASE;
} else {
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
- if ((flags & BLKDEV_DISCARD_ZERO) &&
- !q->limits.discard_zeroes_data)
- return -EOPNOTSUPP;
op = REQ_OP_DISCARD;
}
@@ -109,7 +104,7 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
* @sector: start sector
* @nr_sects: number of sectors to discard
* @gfp_mask: memory allocation flags (for bio_alloc)
- * @flags: BLKDEV_IFL_* flags to control behaviour
+ * @flags: BLKDEV_DISCARD_* flags to control behaviour
*
* Description:
* Issue a discard request for the sectors in question.
@@ -126,7 +121,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
&bio);
if (!ret && bio) {
ret = submit_bio_wait(bio);
- if (ret == -EOPNOTSUPP && !(flags & BLKDEV_DISCARD_ZERO))
+ if (ret == -EOPNOTSUPP)
ret = 0;
bio_put(bio);
}
@@ -226,20 +221,9 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_write_same);
-/**
- * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
- * @bdev: blockdev to issue
- * @sector: start sector
- * @nr_sects: number of sectors to write
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @biop: pointer to anchor bio
- *
- * Description:
- * Generate and issue number of bios(REQ_OP_WRITE_ZEROES) with zerofiled pages.
- */
static int __blkdev_issue_write_zeroes(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
- struct bio **biop)
+ struct bio **biop, unsigned flags)
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
@@ -258,7 +242,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
bio = next_bio(bio, 0, gfp_mask);
bio->bi_iter.bi_sector = sector;
bio->bi_bdev = bdev;
- bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);
+ bio->bi_opf = REQ_OP_WRITE_ZEROES;
+ if (flags & BLKDEV_ZERO_NOUNMAP)
+ bio->bi_opf |= REQ_NOUNMAP;
if (nr_sects > max_write_zeroes_sectors) {
bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
@@ -282,14 +268,27 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
* @biop: pointer to anchor bio
- * @discard: discard flag
+ * @flags: controls detailed behavior
*
* Description:
- * Generate and issue number of bios with zerofiled pages.
+ * Zero-fill a block range, either using hardware offload or by explicitly
+ * writing zeroes to the device.
+ *
+ * Note that this function may fail with -EOPNOTSUPP if the driver signals
+ * zeroing offload support, but the device fails to process the command (for
+ * some devices there is no non-destructive way to verify whether this
+ * operation is actually supported). In this case the caller should retry
+ * the call to blkdev_issue_zeroout() and the fallback path will be used.
+ *
+ * If a device is using logical block provisioning, the underlying space will
+ * not be released if %flags contains BLKDEV_ZERO_NOUNMAP.
+ *
+ * If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return
+ * -EOPNOTSUPP if no explicit hardware offload for zeroing is provided.
*/
int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
- bool discard)
+ unsigned flags)
{
int ret;
int bi_size = 0;
@@ -302,8 +301,8 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
return -EINVAL;
ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
- biop);
- if (ret == 0 || (ret && ret != -EOPNOTSUPP))
+ biop, flags);
+ if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK))
goto out;
ret = 0;
@@ -337,40 +336,23 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
* @sector: start sector
* @nr_sects: number of sectors to write
* @gfp_mask: memory allocation flags (for bio_alloc)
- * @discard: whether to discard the block range
+ * @flags: controls detailed behavior
*
* Description:
- * Zero-fill a block range. If the discard flag is set and the block
- * device guarantees that subsequent READ operations to the block range
- * in question will return zeroes, the blocks will be discarded. Should
- * the discard request fail, if the discard flag is not set, or if
- * discard_zeroes_data is not supported, this function will resort to
- * zeroing the blocks manually, thus provisioning (allocating,
- * anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
- * command(s), blkdev_issue_zeroout() will use it to optimize the process of
- * clearing the block range. Otherwise the zeroing will be performed
- * using regular WRITE calls.
+ * Zero-fill a block range, either using hardware offload or by explicitly
+ * writing zeroes to the device. See __blkdev_issue_zeroout() for the
+ * valid values for %flags.
*/
int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, bool discard)
+ sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
{
int ret;
struct bio *bio = NULL;
struct blk_plug plug;
- if (discard) {
- if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask,
- BLKDEV_DISCARD_ZERO))
- return 0;
- }
-
- if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
- ZERO_PAGE(0)))
- return 0;
-
blk_start_plug(&plug);
ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
- &bio, discard);
+ &bio, flags);
if (ret == 0 && bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2afa262425d10..3990ae4063412 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -54,6 +54,20 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
+/*
+ * Split helper for REQ_OP_WRITE_ZEROES bios.
+ *
+ * Always reports one segment through @nsegs. Returns NULL (no split)
+ * when the queue's max_write_zeroes_sectors limit is zero or the bio
+ * already fits within it; otherwise returns the front part produced by
+ * bio_split() at that limit.
+ */
+static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
+		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+{
+	unsigned int limit = q->limits.max_write_zeroes_sectors;
+
+	*nsegs = 1;
+
+	if (!limit || bio_sectors(bio) <= limit)
+		return NULL;
+
+	return bio_split(bio, limit, GFP_NOIO, bs);
+}
+
static struct bio *blk_bio_write_same_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -200,8 +214,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
split = blk_bio_discard_split(q, *bio, bs, &nsegs);
break;
case REQ_OP_WRITE_ZEROES:
- split = NULL;
- nsegs = (*bio)->bi_phys_segments;
+ split = blk_bio_write_zeroes_split(q, *bio, bs, &nsegs);
break;
case REQ_OP_WRITE_SAME:
split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f6d917977b331..bcd2a7d4a3a52 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -43,11 +43,157 @@ static int blk_mq_debugfs_seq_open(struct inode *inode, struct file *file,
return ret;
}
+/*
+ * Print every bit that is set in @flags to @m, separated by single spaces.
+ *
+ * Bit i is printed as flag_name[i] when i is below @flag_name_count and that
+ * table entry is non-NULL; otherwise the bit number is printed in decimal.
+ * No trailing newline is emitted.  Always returns 0.
+ */
+static int blk_flags_show(struct seq_file *m, const unsigned long flags,
+			  const char *const *flag_name, int flag_name_count)
+{
+	bool first = true;
+	int bit;
+
+	for (bit = 0; bit < sizeof(flags) * BITS_PER_BYTE; bit++) {
+		const char *name = NULL;
+
+		if (!(flags & BIT(bit)))
+			continue;
+
+		/* Separator goes before every entry except the first one. */
+		if (!first)
+			seq_puts(m, " ");
+		first = false;
+
+		if (bit < flag_name_count)
+			name = flag_name[bit];
+
+		if (name)
+			seq_puts(m, name);
+		else
+			seq_printf(m, "%d", bit);
+	}
+
+	return 0;
+}
+
+/*
+ * Printable names for the bits of request_queue->queue_flags, indexed by
+ * QUEUE_FLAG_* bit number.  Handed to blk_flags_show() by
+ * blk_queue_flags_show(); bits without an entry are printed numerically.
+ */
+static const char *const blk_queue_flag_name[] = {
+ [QUEUE_FLAG_QUEUED] = "QUEUED",
+ [QUEUE_FLAG_STOPPED] = "STOPPED",
+ [QUEUE_FLAG_SYNCFULL] = "SYNCFULL",
+ [QUEUE_FLAG_ASYNCFULL] = "ASYNCFULL",
+ [QUEUE_FLAG_DYING] = "DYING",
+ [QUEUE_FLAG_BYPASS] = "BYPASS",
+ [QUEUE_FLAG_BIDI] = "BIDI",
+ [QUEUE_FLAG_NOMERGES] = "NOMERGES",
+ [QUEUE_FLAG_SAME_COMP] = "SAME_COMP",
+ [QUEUE_FLAG_FAIL_IO] = "FAIL_IO",
+ [QUEUE_FLAG_STACKABLE] = "STACKABLE",
+ [QUEUE_FLAG_NONROT] = "NONROT",
+ [QUEUE_FLAG_IO_STAT] = "IO_STAT",
+ [QUEUE_FLAG_DISCARD] = "DISCARD",
+ [QUEUE_FLAG_NOXMERGES] = "NOXMERGES",
+ [QUEUE_FLAG_ADD_RANDOM] = "ADD_RANDOM",
+ [QUEUE_FLAG_SECERASE] = "SECERASE",
+ [QUEUE_FLAG_SAME_FORCE] = "SAME_FORCE",
+ [QUEUE_FLAG_DEAD] = "DEAD",
+ [QUEUE_FLAG_INIT_DONE] = "INIT_DONE",
+ [QUEUE_FLAG_NO_SG_MERGE] = "NO_SG_MERGE",
+ [QUEUE_FLAG_POLL] = "POLL",
+ [QUEUE_FLAG_WC] = "WC",
+ [QUEUE_FLAG_FUA] = "FUA",
+ [QUEUE_FLAG_FLUSH_NQ] = "FLUSH_NQ",
+ [QUEUE_FLAG_DAX] = "DAX",
+ [QUEUE_FLAG_STATS] = "STATS",
+ [QUEUE_FLAG_POLL_STATS] = "POLL_STATS",
+ [QUEUE_FLAG_REGISTERED] = "REGISTERED",
+};
+
+/*
+ * seq_file show callback: print the names of the flags set in the request
+ * queue stored in m->private, followed by a newline.  Always returns 0.
+ */
+static int blk_queue_flags_show(struct seq_file *m, void *v)
+{
+	struct request_queue *queue = m->private;
+	const unsigned long queue_flags = queue->queue_flags;
+
+	blk_flags_show(m, queue_flags, blk_queue_flag_name,
+		       ARRAY_SIZE(blk_queue_flag_name));
+	seq_puts(m, "\n");
+
+	return 0;
+}
+
+/*
+ * Write handler for the queue "state" debugfs attribute.
+ *
+ * The first whitespace-delimited word of the input selects the operation:
+ *   "run"   - calls blk_mq_run_hw_queues(q, true)
+ *   "start" - calls blk_mq_start_stopped_hw_queues(q, true)
+ *
+ * Returns @len on success, -EFAULT if the user buffer cannot be copied, and
+ * -EINVAL for an unknown operation or for input that does not fit in the
+ * local buffer.
+ */
+static ssize_t blk_queue_flags_store(struct file *file, const char __user *ubuf,
+				     size_t len, loff_t *offp)
+{
+	struct request_queue *q = file_inode(file)->i_private;
+	char op[16] = { }, *s;
+
+	/*
+	 * Reject over-long input instead of truncating it: returning a short
+	 * count would make user space resubmit the remainder of its buffer
+	 * as if it were a second command.  The check also guarantees that
+	 * op[] stays NUL-terminated after the copy.
+	 */
+	if (len >= sizeof(op)) {
+		pr_err("%s: operation too long\n", __func__);
+		return -EINVAL;
+	}
+	if (copy_from_user(op, ubuf, len))
+		return -EFAULT;
+	s = op;
+	strsep(&s, " \t\n"); /* cut op at the first whitespace character */
+	if (strcmp(op, "run") == 0) {
+		blk_mq_run_hw_queues(q, true);
+	} else if (strcmp(op, "start") == 0) {
+		blk_mq_start_stopped_hw_queues(q, true);
+	} else {
+		pr_err("%s: unsupported operation %s. Use either 'run' or 'start'\n",
+		       __func__, op);
+		return -EINVAL;
+	}
+	return len;
+}
+
+/*
+ * open() handler: set up a single-record seq_file that shows the queue flags,
+ * passing the inode's private pointer through to blk_queue_flags_show().
+ */
+static int blk_queue_flags_open(struct inode *inode, struct file *file)
+{
+	void *queue = inode->i_private;
+
+	return single_open(file, blk_queue_flags_show, queue);
+}
+
+/*
+ * File operations for the queue flags debugfs file: reads go through the
+ * seq_file helpers set up by blk_queue_flags_open(), writes are handled by
+ * blk_queue_flags_store().
+ */
+static const struct file_operations blk_queue_flags_fops = {
+ .open = blk_queue_flags_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = blk_queue_flags_store,
+};
+
+/*
+ * Print one blk_rq_stat to @m without a trailing newline: the sample count,
+ * mean, min and max when at least one sample was recorded, otherwise only
+ * "samples=0".
+ */
+static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
+{
+	if (!stat->nr_samples) {
+		seq_puts(m, "samples=0");
+		return;
+	}
+
+	seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+		   stat->nr_samples, stat->mean, stat->min, stat->max);
+}
+
+/*
+ * seq_file show callback for the queue's poll statistics.
+ *
+ * q->poll_stat[] is walked in pairs: entry 2*i is printed under a "read"
+ * label and entry 2*i+1 under a "write" label, both tagged with a size of
+ * 1 << (9 + i) bytes.  Always returns 0.
+ */
+static int queue_poll_stat_show(struct seq_file *m, void *v)
+{
+	struct request_queue *q = m->private;
+	int idx;
+
+	for (idx = 0; idx < BLK_MQ_POLL_STATS_BKTS/2; idx++) {
+		const int bytes = 1 << (9+idx);
+		struct blk_rq_stat *rd = &q->poll_stat[2*idx];
+		struct blk_rq_stat *wr = &q->poll_stat[2*idx+1];
+
+		seq_printf(m, "read (%d Bytes): ", bytes);
+		print_stat(m, rd);
+		seq_puts(m, "\n");
+
+		seq_printf(m, "write (%d Bytes): ", bytes);
+		print_stat(m, wr);
+		seq_puts(m, "\n");
+	}
+
+	return 0;
+}
+
+/*
+ * open() handler: set up a single-record seq_file backed by
+ * queue_poll_stat_show(), passing the inode's private pointer through.
+ */
+static int queue_poll_stat_open(struct inode *inode, struct file *file)
+{
+	void *queue = inode->i_private;
+
+	return single_open(file, queue_poll_stat_show, queue);
+}
+
+/*
+ * File operations for the "poll_stat" debugfs file; it has no write handler
+ * and is served by the seq_file set up in queue_poll_stat_open().
+ */
+static const struct file_operations queue_poll_stat_fops = {
+ .open = queue_poll_stat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Printable names for the bits of blk_mq_hw_ctx->state, indexed by
+ * BLK_MQ_S_* bit number.  Used by hctx_state_show() via blk_flags_show().
+ */
+static const char *const hctx_state_name[] = {
+ [BLK_MQ_S_STOPPED] = "STOPPED",
+ [BLK_MQ_S_TAG_ACTIVE] = "TAG_ACTIVE",
+ [BLK_MQ_S_SCHED_RESTART] = "SCHED_RESTART",
+ [BLK_MQ_S_TAG_WAITING] = "TAG_WAITING",
+
+};
static int hctx_state_show(struct seq_file *m, void *v)
{
struct blk_mq_hw_ctx *hctx = m->private;
- seq_printf(m, "0x%lx\n", hctx->state);
+ blk_flags_show(m, hctx->state, hctx_state_name,
+ ARRAY_SIZE(hctx_state_name));
+ seq_puts(m, "\n");
return 0;
}
@@ -63,11 +209,35 @@ static const struct file_operations hctx_state_fops = {
.release = single_release,
};
+/*
+ * Printable names for tag allocation policies, indexed by BLK_TAG_ALLOC_*
+ * value.  hctx_flags_show() uses it for the "alloc_policy=" field and falls
+ * back to the numeric value for entries that are missing.
+ */
+static const char *const alloc_policy_name[] = {
+ [BLK_TAG_ALLOC_FIFO] = "fifo",
+ [BLK_TAG_ALLOC_RR] = "rr",
+};
+
+/*
+ * Printable names for the BLK_MQ_F_* bits of blk_mq_hw_ctx->flags.  The
+ * BLK_MQ_F_* constants are masks, so each entry is indexed by ilog2() of its
+ * mask, i.e. by bit position, as blk_flags_show() expects.
+ */
+static const char *const hctx_flag_name[] = {
+ [ilog2(BLK_MQ_F_SHOULD_MERGE)] = "SHOULD_MERGE",
+ [ilog2(BLK_MQ_F_TAG_SHARED)] = "TAG_SHARED",
+ [ilog2(BLK_MQ_F_SG_MERGE)] = "SG_MERGE",
+ [ilog2(BLK_MQ_F_BLOCKING)] = "BLOCKING",
+ [ilog2(BLK_MQ_F_NO_SCHED)] = "NO_SCHED",
+};
+
static int hctx_flags_show(struct seq_file *m, void *v)
{
struct blk_mq_hw_ctx *hctx = m->private;
-
- seq_printf(m, "0x%lx\n", hctx->flags);
+ const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags);
+
+ seq_puts(m, "alloc_policy=");
+ if (alloc_policy < ARRAY_SIZE(alloc_policy_name) &&
+ alloc_policy_name[alloc_policy])
+ seq_puts(m, alloc_policy_name[alloc_policy]);
+ else
+ seq_printf(m, "%d", alloc_policy);
+ seq_puts(m, " ");
+ blk_flags_show(m,
+ hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy),
+ hctx_flag_name, ARRAY_SIZE(hctx_flag_name));
+ seq_puts(m, "\n");
return 0;
}
@@ -83,13 +253,83 @@ static const struct file_operations hctx_flags_fops = {
.release = single_release,
};
+/*
+ * Printable names for request operations, indexed by REQ_OP_* value.
+ * blk_mq_debugfs_rq_show() looks up (rq->cmd_flags & REQ_OP_MASK) here and
+ * prints the number instead when there is no entry.
+ */
+static const char *const op_name[] = {
+ [REQ_OP_READ] = "READ",
+ [REQ_OP_WRITE] = "WRITE",
+ [REQ_OP_FLUSH] = "FLUSH",
+ [REQ_OP_DISCARD] = "DISCARD",
+ [REQ_OP_ZONE_REPORT] = "ZONE_REPORT",
+ [REQ_OP_SECURE_ERASE] = "SECURE_ERASE",
+ [REQ_OP_ZONE_RESET] = "ZONE_RESET",
+ [REQ_OP_WRITE_SAME] = "WRITE_SAME",
+ [REQ_OP_WRITE_ZEROES] = "WRITE_ZEROES",
+ [REQ_OP_SCSI_IN] = "SCSI_IN",
+ [REQ_OP_SCSI_OUT] = "SCSI_OUT",
+ [REQ_OP_DRV_IN] = "DRV_IN",
+ [REQ_OP_DRV_OUT] = "DRV_OUT",
+};
+
+/*
+ * Printable names for the flag bits of request->cmd_flags (everything
+ * outside REQ_OP_MASK), indexed by __REQ_* bit number.  Used by
+ * blk_mq_debugfs_rq_show() via blk_flags_show().
+ *
+ * __REQ_NR_BITS is intentionally not listed: it marks the end of the flag
+ * bit enumeration rather than naming a flag, so a bit at that position is
+ * reported numerically like any other unnamed bit.
+ */
+static const char *const cmd_flag_name[] = {
+	[__REQ_FAILFAST_DEV]		= "FAILFAST_DEV",
+	[__REQ_FAILFAST_TRANSPORT]	= "FAILFAST_TRANSPORT",
+	[__REQ_FAILFAST_DRIVER]		= "FAILFAST_DRIVER",
+	[__REQ_SYNC]			= "SYNC",
+	[__REQ_META]			= "META",
+	[__REQ_PRIO]			= "PRIO",
+	[__REQ_NOMERGE]			= "NOMERGE",
+	[__REQ_IDLE]			= "IDLE",
+	[__REQ_INTEGRITY]		= "INTEGRITY",
+	[__REQ_FUA]			= "FUA",
+	[__REQ_PREFLUSH]		= "PREFLUSH",
+	[__REQ_RAHEAD]			= "RAHEAD",
+	[__REQ_BACKGROUND]		= "BACKGROUND",
+};
+
+/*
+ * Printable names for the RQF_* bits of request->rq_flags.  Each RQF_* mask
+ * is force-cast to u32 and passed through ilog2() so that the table is
+ * indexed by bit position, as blk_flags_show() expects.
+ */
+static const char *const rqf_name[] = {
+ [ilog2((__force u32)RQF_SORTED)] = "SORTED",
+ [ilog2((__force u32)RQF_STARTED)] = "STARTED",
+ [ilog2((__force u32)RQF_QUEUED)] = "QUEUED",
+ [ilog2((__force u32)RQF_SOFTBARRIER)] = "SOFTBARRIER",
+ [ilog2((__force u32)RQF_FLUSH_SEQ)] = "FLUSH_SEQ",
+ [ilog2((__force u32)RQF_MIXED_MERGE)] = "MIXED_MERGE",
+ [ilog2((__force u32)RQF_MQ_INFLIGHT)] = "MQ_INFLIGHT",
+ [ilog2((__force u32)RQF_DONTPREP)] = "DONTPREP",
+ [ilog2((__force u32)RQF_PREEMPT)] = "PREEMPT",
+ [ilog2((__force u32)RQF_COPY_USER)] = "COPY_USER",
+ [ilog2((__force u32)RQF_FAILED)] = "FAILED",
+ [ilog2((__force u32)RQF_QUIET)] = "QUIET",
+ [ilog2((__force u32)RQF_ELVPRIV)] = "ELVPRIV",
+ [ilog2((__force u32)RQF_IO_STAT)] = "IO_STAT",
+ [ilog2((__force u32)RQF_ALLOCED)] = "ALLOCED",
+ [ilog2((__force u32)RQF_PM)] = "PM",
+ [ilog2((__force u32)RQF_HASHED)] = "HASHED",
+ [ilog2((__force u32)RQF_STATS)] = "STATS",
+ [ilog2((__force u32)RQF_SPECIAL_PAYLOAD)] = "SPECIAL_PAYLOAD",
+};
+
static int blk_mq_debugfs_rq_show(struct seq_file *m, void *v)
{
struct request *rq = list_entry_rq(v);
-
- seq_printf(m, "%p {.cmd_flags=0x%x, .rq_flags=0x%x, .tag=%d, .internal_tag=%d}\n",
- rq, rq->cmd_flags, (__force unsigned int)rq->rq_flags,
- rq->tag, rq->internal_tag);
+ const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
+ const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
+
+ seq_printf(m, "%p {.op=", rq);
+ if (op < ARRAY_SIZE(op_name) && op_name[op])
+ seq_printf(m, "%s", op_name[op]);
+ else
+ seq_printf(m, "%d", op);
+ seq_puts(m, ", .cmd_flags=");
+ blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
+ ARRAY_SIZE(cmd_flag_name));
+ seq_puts(m, ", .rq_flags=");
+ blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
+ ARRAY_SIZE(rqf_name));
+ seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag,
+ rq->internal_tag);
+ if (mq_ops->show_rq)
+ mq_ops->show_rq(m, rq);
+ seq_puts(m, "}\n");
return 0;
}
@@ -322,60 +562,6 @@ static const struct file_operations hctx_io_poll_fops = {
.release = single_release,
};
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
- seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
- stat->nr_samples, stat->mean, stat->min, stat->max);
-}
-
-static int hctx_stats_show(struct seq_file *m, void *v)
-{
- struct blk_mq_hw_ctx *hctx = m->private;
- struct blk_rq_stat stat[2];
-
- blk_stat_init(&stat[BLK_STAT_READ]);
- blk_stat_init(&stat[BLK_STAT_WRITE]);
-
- blk_hctx_stat_get(hctx, stat);
-
- seq_puts(m, "read: ");
- print_stat(m, &stat[BLK_STAT_READ]);
- seq_puts(m, "\n");
-
- seq_puts(m, "write: ");
- print_stat(m, &stat[BLK_STAT_WRITE]);
- seq_puts(m, "\n");
- return 0;
-}
-
-static int hctx_stats_open(struct inode *inode, struct file *file)
-{
- return single_open(file, hctx_stats_show, inode->i_private);
-}
-
-static ssize_t hctx_stats_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct seq_file *m = file->private_data;
- struct blk_mq_hw_ctx *hctx = m->private;
- struct blk_mq_ctx *ctx;
- int i;
-
- hctx_for_each_ctx(hctx, ctx, i) {
- blk_stat_init(&ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
- }
- return count;
-}
-
-static const struct file_operations hctx_stats_fops = {
- .open = hctx_stats_open,
- .read = seq_read,
- .write = hctx_stats_write,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int hctx_dispatched_show(struct seq_file *m, void *v)
{
struct blk_mq_hw_ctx *hctx = m->private;
@@ -636,6 +822,12 @@ static const struct file_operations ctx_completed_fops = {
.release = single_release,
};
+/*
+ * Per-queue debugfs files, created in q->mq_debugfs_dir by
+ * blk_mq_debugfs_register_mq() through debugfs_create_files().  Each entry
+ * lists a file name, what looks like its permission mode, and its file
+ * operations; the empty entry terminates the list.
+ */
+static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
+ {"poll_stat", 0400, &queue_poll_stat_fops},
+ {"state", 0600, &blk_queue_flags_fops},
+ {},
+};
+
static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"state", 0400, &hctx_state_fops},
{"flags", 0400, &hctx_flags_fops},
@@ -646,7 +838,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"sched_tags", 0400, &hctx_sched_tags_fops},
{"sched_tags_bitmap", 0400, &hctx_sched_tags_bitmap_fops},
{"io_poll", 0600, &hctx_io_poll_fops},
- {"stats", 0600, &hctx_stats_fops},
{"dispatched", 0600, &hctx_dispatched_fops},
{"queued", 0600, &hctx_queued_fops},
{"run", 0600, &hctx_run_fops},
@@ -662,16 +853,17 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{},
};
-int blk_mq_debugfs_register(struct request_queue *q, const char *name)
+int blk_mq_debugfs_register(struct request_queue *q)
{
if (!blk_debugfs_root)
return -ENOENT;
- q->debugfs_dir = debugfs_create_dir(name, blk_debugfs_root);
+ q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
+ blk_debugfs_root);
if (!q->debugfs_dir)
goto err;
- if (blk_mq_debugfs_register_hctxs(q))
+ if (blk_mq_debugfs_register_mq(q))
goto err;
return 0;
@@ -741,7 +933,7 @@ static int blk_mq_debugfs_register_hctx(struct request_queue *q,
return 0;
}
-int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+int blk_mq_debugfs_register_mq(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
@@ -753,6 +945,9 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
if (!q->mq_debugfs_dir)
goto err;
+ if (!debugfs_create_files(q->mq_debugfs_dir, q, blk_mq_debugfs_queue_attrs))
+ goto err;
+
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_debugfs_register_hctx(q, hctx))
goto err;
@@ -761,11 +956,11 @@ int blk_mq_debugfs_register_hctxs(struct request_queue *q)
return 0;
err:
- blk_mq_debugfs_unregister_hctxs(q);
+ blk_mq_debugfs_unregister_mq(q);
return -ENOMEM;
}
-void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+void blk_mq_debugfs_unregister_mq(struct request_queue *q)
{
debugfs_remove_recursive(q->mq_debugfs_dir);
q->mq_debugfs_dir = NULL;
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index 966c2169762eb..0c3354cf35528 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -23,7 +23,7 @@
* @pdev: PCI device associated with @set.
*
* This function assumes the PCI device @pdev has at least as many available
- * interrupt vetors as @set has queues. It will then queuery the vector
+ * interrupt vectors as @set has queues. It will then query the vector
* corresponding to each queue for it's affinity mask and built queue mapping
* that maps a queue to the CPUs that have irq affinity for the corresponding
* vector.
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index c974a1bbf4cba..8b361e192e8a9 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -30,43 +30,6 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
- int (*init)(struct blk_mq_hw_ctx *),
- void (*exit)(struct blk_mq_hw_ctx *))
-{
- struct blk_mq_hw_ctx *hctx;
- int ret;
- int i;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
- if (!hctx->sched_data) {
- ret = -ENOMEM;
- goto error;
- }
-
- if (init) {
- ret = init(hctx);
- if (ret) {
- /*
- * We don't want to give exit() a partially
- * initialized sched_data. init() must clean up
- * if it fails.
- */
- kfree(hctx->sched_data);
- hctx->sched_data = NULL;
- goto error;
- }
- }
- }
-
- return 0;
-error:
- blk_mq_sched_free_hctx_data(q, exit);
- return ret;
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
-
static void __blk_mq_sched_assign_ioc(struct request_queue *q,
struct request *rq,
struct bio *bio,
@@ -119,7 +82,11 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
- if (e) {
+ /*
+ * For a reserved tag, allocate a normal request since we might
+ * have driver dependencies on the value of the internal tag.
+ */
+ if (e && !(data->flags & BLK_MQ_REQ_RESERVED)) {
data->flags |= BLK_MQ_REQ_INTERNAL;
/*
@@ -227,22 +194,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
}
-void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
- struct list_head *rq_list,
- struct request *(*get_rq)(struct blk_mq_hw_ctx *))
-{
- do {
- struct request *rq;
-
- rq = get_rq(hctx);
- if (!rq)
- break;
-
- list_add_tail(&rq->queuelist, rq_list);
- } while (1);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
-
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
struct request **merged_request)
{
@@ -508,11 +459,24 @@ int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
{
struct elevator_queue *e = q->elevator;
+ int ret;
if (!e)
return 0;
- return blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
+ ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
+ if (ret)
+ return ret;
+
+ if (e->type->ops.mq.init_hctx) {
+ ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
+ if (ret) {
+ blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
+ return ret;
+ }
+ }
+
+ return 0;
}
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
@@ -523,12 +487,18 @@ void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
if (!e)
return;
+ if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
+ e->type->ops.mq.exit_hctx(hctx, hctx_idx);
+ hctx->sched_data = NULL;
+ }
+
blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
struct blk_mq_hw_ctx *hctx;
+ struct elevator_queue *eq;
unsigned int i;
int ret;
@@ -553,6 +523,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
if (ret)
goto err;
+ if (e->ops.mq.init_hctx) {
+ queue_for_each_hw_ctx(q, hctx, i) {
+ ret = e->ops.mq.init_hctx(hctx, i);
+ if (ret) {
+ eq = q->elevator;
+ blk_mq_exit_sched(q, eq);
+ kobject_put(&eq->kobj);
+ return ret;
+ }
+ }
+ }
+
return 0;
err:
@@ -563,6 +545,17 @@ err:
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned int i;
+
+ if (e->type->ops.mq.exit_hctx) {
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (hctx->sched_data) {
+ e->type->ops.mq.exit_hctx(hctx, i);
+ hctx->sched_data = NULL;
+ }
+ }
+ }
if (e->type->ops.mq.exit_sched)
e->type->ops.mq.exit_sched(e);
blk_mq_sched_tags_teardown(q);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 3a9e6e40558b5..edafb5383b7bb 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -4,10 +4,6 @@
#include "blk-mq.h"
#include "blk-mq-tag.h"
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
- int (*init)(struct blk_mq_hw_ctx *),
- void (*exit)(struct blk_mq_hw_ctx *));
-
void blk_mq_sched_free_hctx_data(struct request_queue *q,
void (*exit)(struct blk_mq_hw_ctx *));
@@ -28,9 +24,6 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
struct list_head *list, bool run_queue_async);
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
-void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
- struct list_head *rq_list,
- struct request *(*get_rq)(struct blk_mq_hw_ctx *));
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
@@ -86,17 +79,12 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
return true;
}
-static inline void
-blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static inline void blk_mq_sched_completed_request(struct request *rq)
{
- struct elevator_queue *e = hctx->queue->elevator;
+ struct elevator_queue *e = rq->q->elevator;
if (e && e->type->ops.mq.completed_request)
- e->type->ops.mq.completed_request(hctx, rq);
-
- BUG_ON(rq->internal_tag == -1);
-
- blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+ e->type->ops.mq.completed_request(rq);
}
static inline void blk_mq_sched_started_request(struct request *rq)
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index d745ab81033af..ec0afdf765e39 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -253,10 +253,12 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
+ lockdep_assert_held(&q->sysfs_lock);
+
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
- blk_mq_debugfs_unregister_hctxs(q);
+ blk_mq_debugfs_unregister_mq(q);
kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
kobject_del(&q->mq_kobj);
@@ -267,9 +269,9 @@ static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
- blk_mq_disable_hotplug();
+ mutex_lock(&q->sysfs_lock);
__blk_mq_unregister_dev(dev, q);
- blk_mq_enable_hotplug();
+ mutex_unlock(&q->sysfs_lock);
}
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
@@ -302,12 +304,13 @@ void blk_mq_sysfs_init(struct request_queue *q)
}
}
-int blk_mq_register_dev(struct device *dev, struct request_queue *q)
+int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int ret, i;
- blk_mq_disable_hotplug();
+ WARN_ON_ONCE(!q->kobj.parent);
+ lockdep_assert_held(&q->sysfs_lock);
ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -315,20 +318,38 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
kobject_uevent(&q->mq_kobj, KOBJ_ADD);
- blk_mq_debugfs_register(q, kobject_name(&dev->kobj));
+ blk_mq_debugfs_register(q);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
if (ret)
- break;
+ goto unreg;
}
- if (ret)
- __blk_mq_unregister_dev(dev, q);
- else
- q->mq_sysfs_init_done = true;
+ q->mq_sysfs_init_done = true;
+
out:
- blk_mq_enable_hotplug();
+ return ret;
+
+unreg:
+ while (--i >= 0)
+ blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
+
+ blk_mq_debugfs_unregister_mq(q);
+
+ kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
+ kobject_del(&q->mq_kobj);
+ kobject_put(&dev->kobj);
+ return ret;
+}
+
+/*
+ * Locked entry point for registering the blk-mq sysfs objects of @q: takes
+ * q->sysfs_lock around __blk_mq_register_dev() and passes its return value
+ * through unchanged.
+ */
+int blk_mq_register_dev(struct device *dev, struct request_queue *q)
+{
+	int rc;
+
+	mutex_lock(&q->sysfs_lock);
+	rc = __blk_mq_register_dev(dev, q);
+	mutex_unlock(&q->sysfs_lock);
+
+	return rc;
+}
@@ -339,13 +360,17 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
+ mutex_lock(&q->sysfs_lock);
if (!q->mq_sysfs_init_done)
- return;
+ goto unlock;
- blk_mq_debugfs_unregister_hctxs(q);
+ blk_mq_debugfs_unregister_mq(q);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
+
+unlock:
+ mutex_unlock(&q->sysfs_lock);
}
int blk_mq_sysfs_register(struct request_queue *q)
@@ -353,10 +378,11 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
+ mutex_lock(&q->sysfs_lock);
if (!q->mq_sysfs_init_done)
- return ret;
+ goto unlock;
- blk_mq_debugfs_register_hctxs(q);
+ blk_mq_debugfs_register_mq(q);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_register_hctx(hctx);
@@ -364,5 +390,8 @@ int blk_mq_sysfs_register(struct request_queue *q)
break;
}
+unlock:
+ mutex_unlock(&q->sysfs_lock);
+
return ret;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9d97bfc4d4657..d0be72ccb0914 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -96,7 +96,10 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
!hctx_may_queue(data->hctx, bt))
return -1;
- return __sbitmap_queue_get(bt);
+ if (data->shallow_depth)
+ return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
+ else
+ return __sbitmap_queue_get(bt);
}
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c7836a1ded973..bf90684a007a2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -39,6 +39,26 @@
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
+static void blk_mq_poll_stats_start(struct request_queue *q);
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
+
+/*
+ * Map a request to its poll-statistics bucket index.
+ *
+ * The index is rq_data_dir(rq) + 2 * (ilog2(blk_rq_bytes(rq)) - 9).
+ * Returns -1 when that value is negative.  Values at or beyond
+ * BLK_MQ_POLL_STATS_BKTS are clamped to ddir + BLK_MQ_POLL_STATS_BKTS - 2
+ * (presumably the last bucket for that direction, assuming the direction is
+ * 0 or 1 -- confirm against rq_data_dir()).
+ */
+static int blk_mq_poll_stats_bkt(const struct request *rq)
+{
+	const int dir = rq_data_dir(rq);
+	const int nbytes = blk_rq_bytes(rq);
+	const int idx = dir + 2*(ilog2(nbytes) - 9);
+
+	if (idx < 0)
+		return -1;
+
+	if (idx >= BLK_MQ_POLL_STATS_BKTS)
+		return dir + BLK_MQ_POLL_STATS_BKTS - 2;
+
+	return idx;
+}
+
/*
* Check if any of the ctx's have pending work in this hardware queue
*/
@@ -65,7 +85,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
}
-void blk_mq_freeze_queue_start(struct request_queue *q)
+void blk_freeze_queue_start(struct request_queue *q)
{
int freeze_depth;
@@ -75,7 +95,7 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
blk_mq_run_hw_queues(q, false);
}
}
-EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
+EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
void blk_mq_freeze_queue_wait(struct request_queue *q)
{
@@ -105,7 +125,7 @@ void blk_freeze_queue(struct request_queue *q)
* no blk_unfreeze_queue(), and blk_freeze_queue() is not
* exported to drivers as the only user for unfreeze is blk_mq.
*/
- blk_mq_freeze_queue_start(q);
+ blk_freeze_queue_start(q);
blk_mq_freeze_queue_wait(q);
}
@@ -210,7 +230,6 @@ void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
#endif
rq->special = NULL;
/* tag was already set */
- rq->errors = 0;
rq->extra_len = 0;
INIT_LIST_HEAD(&rq->timeout_list);
@@ -347,7 +366,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
if (rq->tag != -1)
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
- blk_mq_sched_completed_request(hctx, rq);
+ blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
}
@@ -365,6 +384,7 @@ void blk_mq_finish_request(struct request *rq)
{
blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
}
+EXPORT_SYMBOL_GPL(blk_mq_finish_request);
void blk_mq_free_request(struct request *rq)
{
@@ -402,12 +422,19 @@ static void __blk_mq_complete_request_remote(void *data)
rq->q->softirq_done_fn(rq);
}
-static void blk_mq_ipi_complete_request(struct request *rq)
+static void __blk_mq_complete_request(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
bool shared = false;
int cpu;
+ if (rq->internal_tag != -1)
+ blk_mq_sched_completed_request(rq);
+ if (rq->rq_flags & RQF_STATS) {
+ blk_mq_poll_stats_start(rq->q);
+ blk_stat_add(rq);
+ }
+
if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
rq->q->softirq_done_fn(rq);
return;
@@ -428,33 +455,6 @@ static void blk_mq_ipi_complete_request(struct request *rq)
put_cpu();
}
-static void blk_mq_stat_add(struct request *rq)
-{
- if (rq->rq_flags & RQF_STATS) {
- /*
- * We could rq->mq_ctx here, but there's less of a risk
- * of races if we have the completion event add the stats
- * to the local software queue.
- */
- struct blk_mq_ctx *ctx;
-
- ctx = __blk_mq_get_ctx(rq->q, raw_smp_processor_id());
- blk_stat_add(&ctx->stat[rq_data_dir(rq)], rq);
- }
-}
-
-static void __blk_mq_complete_request(struct request *rq)
-{
- struct request_queue *q = rq->q;
-
- blk_mq_stat_add(rq);
-
- if (!q->softirq_done_fn)
- blk_mq_end_request(rq, rq->errors);
- else
- blk_mq_ipi_complete_request(rq);
-}
-
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
@@ -463,16 +463,14 @@ static void __blk_mq_complete_request(struct request *rq)
* Ends all I/O on a request. It does not handle partial completions.
* The actual completion happens out-of-order, through a IPI handler.
**/
-void blk_mq_complete_request(struct request *rq, int error)
+void blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
if (unlikely(blk_should_fake_timeout(q)))
return;
- if (!blk_mark_rq_complete(rq)) {
- rq->errors = error;
+ if (!blk_mark_rq_complete(rq))
__blk_mq_complete_request(rq);
- }
}
EXPORT_SYMBOL(blk_mq_complete_request);
@@ -491,7 +489,7 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- blk_stat_set_issue_time(&rq->issue_stat);
+ blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
rq->rq_flags |= RQF_STATS;
wbt_issue(q->rq_wb, &rq->issue_stat);
}
@@ -526,6 +524,15 @@ void blk_mq_start_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_start_request);
+/*
+ * When we get here because the queue is busy, the REQ_ATOM_COMPLETE flag
+ * has not been set yet, so in principle this can race with the timeout
+ * handler. However, rq->deadline has only just been set in .queue_rq() in
+ * that situation, so the race cannot happen in practice: rq->timeout is
+ * expected to be large enough to cover the window between the call to
+ * blk_mq_start_request() from .queue_rq() and the clearing of
+ * REQ_ATOM_STARTED here.
+ */
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -633,8 +640,7 @@ void blk_mq_abort_requeue_list(struct request_queue *q)
rq = list_first_entry(&rq_list, struct request, queuelist);
list_del_init(&rq->queuelist);
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
+ blk_mq_end_request(rq, -EIO);
}
}
EXPORT_SYMBOL(blk_mq_abort_requeue_list);
@@ -666,7 +672,7 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved)
* just be ignored. This can happen due to the bitflag ordering.
* Timeout first checks if STARTED is set, and if it is, assumes
* the request is active. But if we race with completion, then
- * we both flags will get cleared. So check here again, and ignore
+ * both flags will get cleared. So check here again, and ignore
* a timeout event with a request that isn't active.
*/
if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
@@ -699,6 +705,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
return;
+	/*
+	 * The rq being checked may already have been freed and reallocated
+	 * by the time we get here. We avoid this race by checking
+	 * rq->deadline and the REQ_ATOM_COMPLETE flag together:
+	 *
+	 * - if rq->deadline is observed with its new value because the rq
+	 *   was reused, the rq won't be timed out because of timing (the
+	 *   new deadline has not passed yet).
+	 * - if rq->deadline is observed with its previous value, the
+	 *   REQ_ATOM_COMPLETE flag won't have been cleared in the reuse
+	 *   path yet, because there is a barrier between setting
+	 *   rq->deadline and clearing the flag in blk_mq_start_request(),
+	 *   so this rq won't be timed out either.
+	 */
if (time_after_eq(jiffies, rq->deadline)) {
if (!blk_mark_rq_complete(rq))
blk_mq_rq_timed_out(rq, reserved);
@@ -727,7 +746,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
* percpu_ref_tryget directly, because we need to be able to
* obtain a reference even in the short window between the queue
* starting to freeze, by dropping the first reference in
- * blk_mq_freeze_queue_start, and the moment the last request is
+ * blk_freeze_queue_start, and the moment the last request is
* consumed, marked by the instant q_usage_counter reaches
* zero.
*/
@@ -845,6 +864,8 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
};
+ might_sleep_if(wait);
+
if (rq->tag != -1)
goto done;
@@ -964,20 +985,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
{
struct blk_mq_hw_ctx *hctx;
struct request *rq;
- LIST_HEAD(driver_list);
- struct list_head *dptr;
int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
if (list_empty(list))
return false;
/*
- * Start off with dptr being NULL, so we start the first request
- * immediately, even if we have more pending.
- */
- dptr = NULL;
-
- /*
* Now process all the entries, sending them to the driver.
*/
errors = queued = 0;
@@ -993,23 +1006,21 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
* The initial allocation attempt failed, so we need to
* rerun the hardware queue when a tag is freed.
*/
- if (blk_mq_dispatch_wait_add(hctx)) {
- /*
- * It's possible that a tag was freed in the
- * window between the allocation failure and
- * adding the hardware queue to the wait queue.
- */
- if (!blk_mq_get_driver_tag(rq, &hctx, false))
- break;
- } else {
+ if (!blk_mq_dispatch_wait_add(hctx))
+ break;
+
+ /*
+ * It's possible that a tag was freed in the window
+ * between the allocation failure and adding the
+ * hardware queue to the wait queue.
+ */
+ if (!blk_mq_get_driver_tag(rq, &hctx, false))
break;
- }
}
list_del_init(&rq->queuelist);
bd.rq = rq;
- bd.list = dptr;
/*
* Flag last if we have no more requests, or if we have more
@@ -1038,20 +1049,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
pr_err("blk-mq: bad return on queue: %d\n", ret);
case BLK_MQ_RQ_QUEUE_ERROR:
errors++;
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
+ blk_mq_end_request(rq, -EIO);
break;
}
if (ret == BLK_MQ_RQ_QUEUE_BUSY)
break;
-
- /*
- * We've done the first request. If we have more than 1
- * left in the list, set dptr to defer issue.
- */
- if (!dptr && list->next != list->prev)
- dptr = &driver_list;
} while (!list_empty(list));
hctx->dispatched[queued_to_index(queued)]++;
@@ -1062,8 +1065,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
*/
if (!list_empty(list)) {
/*
- * If we got a driver tag for the next request already,
- * free it again.
+ * If an I/O scheduler has been configured and we got a driver
+ * tag for the next request already, free it again.
*/
rq = list_first_entry(list, struct request, queuelist);
blk_mq_put_driver_tag(rq);
@@ -1073,16 +1076,24 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
spin_unlock(&hctx->lock);
/*
- * the queue is expected stopped with BLK_MQ_RQ_QUEUE_BUSY, but
- * it's possible the queue is stopped and restarted again
- * before this. Queue restart will dispatch requests. And since
- * requests in rq_list aren't added into hctx->dispatch yet,
- * the requests in rq_list might get lost.
+ * If SCHED_RESTART was set by the caller of this function and
+ * it is no longer set that means that it was cleared by another
+ * thread and hence that a queue rerun is needed.
*
- * blk_mq_run_hw_queue() already checks the STOPPED bit
+ * If TAG_WAITING is set that means that an I/O scheduler has
+ * been configured and another thread is waiting for a driver
+ * tag. To guarantee fairness, do not rerun this hardware queue
+ * but let the other thread grab the driver tag.
*
- * If RESTART or TAG_WAITING is set, then let completion restart
- * the queue instead of potentially looping here.
+ * If no I/O scheduler has been configured it is possible that
+ * the hardware queue got stopped and restarted before requests
+ * were pushed back onto the dispatch list. Rerun the queue to
+ * avoid starvation. Notes:
+ * - blk_mq_run_hw_queue() checks whether or not a queue has
+ * been stopped before rerunning a queue.
+ * - Some but not all block drivers stop a queue before
+ * returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
+ * and dm-rq.
*/
if (!blk_mq_sched_needs_restart(hctx) &&
!test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
@@ -1104,6 +1115,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_dispatch_requests(hctx);
rcu_read_unlock();
} else {
+ might_sleep();
+
srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
blk_mq_sched_dispatch_requests(hctx);
srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
@@ -1153,13 +1166,9 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
put_cpu();
}
- if (msecs == 0)
- kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->run_work);
- else
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->delayed_run_work,
- msecs_to_jiffies(msecs));
+ kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ &hctx->run_work,
+ msecs_to_jiffies(msecs));
}
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@ -1172,6 +1181,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
__blk_mq_delay_run_hw_queue(hctx, async, 0);
}
+EXPORT_SYMBOL(blk_mq_run_hw_queue);
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
@@ -1210,8 +1220,7 @@ EXPORT_SYMBOL(blk_mq_queue_stopped);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
- cancel_work(&hctx->run_work);
- cancel_delayed_work(&hctx->delay_work);
+ cancel_delayed_work_sync(&hctx->run_work);
set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);
@@ -1268,38 +1277,40 @@ static void blk_mq_run_work_fn(struct work_struct *work)
{
struct blk_mq_hw_ctx *hctx;
- hctx = container_of(work, struct blk_mq_hw_ctx, run_work);
-
- __blk_mq_run_hw_queue(hctx);
-}
+ hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
-static void blk_mq_delayed_run_work_fn(struct work_struct *work)
-{
- struct blk_mq_hw_ctx *hctx;
+ /*
+ * If we are stopped, don't run the queue. The exception is if
+ * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
+ * the STOPPED bit and run it.
+ */
+ if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
+ if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
+ return;
- hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work);
+ clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+ clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
+ }
__blk_mq_run_hw_queue(hctx);
}
-static void blk_mq_delay_work_fn(struct work_struct *work)
-{
- struct blk_mq_hw_ctx *hctx;
-
- hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
-
- if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
- __blk_mq_run_hw_queue(hctx);
-}
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
return;
+ /*
+ * Stop the hw queue, then modify currently delayed work.
+ * This should prevent us from running the queue prematurely.
+ * Mark the queue as auto-clearing STOPPED when it runs.
+ */
blk_mq_stop_hw_queue(hctx);
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->delay_work, msecs_to_jiffies(msecs));
+ set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+ kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
+ &hctx->run_work,
+ msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_queue);
@@ -1408,7 +1419,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
- init_request_from_bio(rq, bio);
+ blk_init_request_from_bio(rq, bio);
blk_account_io_start(rq, true);
}
@@ -1453,14 +1464,13 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
-static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
+static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
bool may_sleep)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
.rq = rq,
- .list = NULL,
- .last = 1
+ .last = true,
};
struct blk_mq_hw_ctx *hctx;
blk_qc_t new_cookie;
@@ -1485,31 +1495,42 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie,
return;
}
- __blk_mq_requeue_request(rq);
-
if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
*cookie = BLK_QC_T_NONE;
- rq->errors = -EIO;
- blk_mq_end_request(rq, rq->errors);
+ blk_mq_end_request(rq, -EIO);
return;
}
+ __blk_mq_requeue_request(rq);
insert:
blk_mq_sched_insert_request(rq, false, true, false, may_sleep);
}
-/*
- * Multiple hardware queue variant. This will not use per-process plugs,
- * but will attempt to bypass the hctx queueing if we can go straight to
- * hardware for SYNC IO.
- */
+static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, blk_qc_t *cookie)
+{
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+ rcu_read_lock();
+ __blk_mq_try_issue_directly(rq, cookie, false);
+ rcu_read_unlock();
+ } else {
+ unsigned int srcu_idx;
+
+ might_sleep();
+
+ srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
+ __blk_mq_try_issue_directly(rq, cookie, true);
+ srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
+ }
+}
+
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = { .flags = 0 };
struct request *rq;
- unsigned int request_count = 0, srcu_idx;
+ unsigned int request_count = 0;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
@@ -1545,147 +1566,21 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
cookie = request_to_qc_t(data.hctx, rq);
- if (unlikely(is_flush_fua)) {
- if (q->elevator)
- goto elv_insert;
- blk_mq_bio_to_request(rq, bio);
- blk_insert_flush(rq);
- goto run_queue;
- }
-
plug = current->plug;
- /*
- * If the driver supports defer issued based on 'last', then
- * queue it up like normal since we can potentially save some
- * CPU this way.
- */
- if (((plug && !blk_queue_nomerges(q)) || is_sync) &&
- !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
- struct request *old_rq = NULL;
-
- blk_mq_bio_to_request(rq, bio);
-
- /*
- * We do limited plugging. If the bio can be merged, do that.
- * Otherwise the existing request in the plug list will be
- * issued. So the plug list will have one request at most
- */
- if (plug) {
- /*
- * The plug list might get flushed before this. If that
- * happens, same_queue_rq is invalid and plug list is
- * empty
- */
- if (same_queue_rq && !list_empty(&plug->mq_list)) {
- old_rq = same_queue_rq;
- list_del_init(&old_rq->queuelist);
- }
- list_add_tail(&rq->queuelist, &plug->mq_list);
- } else /* is_sync */
- old_rq = rq;
+ if (unlikely(is_flush_fua)) {
blk_mq_put_ctx(data.ctx);
- if (!old_rq)
- goto done;
-
- if (!(data.hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- blk_mq_try_issue_directly(old_rq, &cookie, false);
- rcu_read_unlock();
+ blk_mq_bio_to_request(rq, bio);
+ if (q->elevator) {
+ blk_mq_sched_insert_request(rq, false, true, true,
+ true);
} else {
- srcu_idx = srcu_read_lock(&data.hctx->queue_rq_srcu);
- blk_mq_try_issue_directly(old_rq, &cookie, true);
- srcu_read_unlock(&data.hctx->queue_rq_srcu, srcu_idx);
+ blk_insert_flush(rq);
+ blk_mq_run_hw_queue(data.hctx, true);
}
- goto done;
- }
-
- if (q->elevator) {
-elv_insert:
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true,
- !is_sync || is_flush_fua, true);
- goto done;
- }
- if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
- /*
- * For a SYNC request, send it to the hardware immediately. For
- * an ASYNC request, just ensure that we run it later on. The
- * latter allows for merging opportunities and more efficient
- * dispatching.
- */
-run_queue:
- blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
- }
- blk_mq_put_ctx(data.ctx);
-done:
- return cookie;
-}
-
-/*
- * Single hardware queue variant. This will attempt to use any per-process
- * plug for merging and IO deferral.
- */
-static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
-{
- const int is_sync = op_is_sync(bio->bi_opf);
- const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_plug *plug;
- unsigned int request_count = 0;
- struct blk_mq_alloc_data data = { .flags = 0 };
- struct request *rq;
- blk_qc_t cookie;
- unsigned int wb_acct;
-
- blk_queue_bounce(q, &bio);
-
- if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_io_error(bio);
- return BLK_QC_T_NONE;
- }
-
- blk_queue_split(q, &bio, q->bio_split);
-
- if (!is_flush_fua && !blk_queue_nomerges(q)) {
- if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
- return BLK_QC_T_NONE;
- } else
- request_count = blk_plug_queued_count(q);
-
- if (blk_mq_sched_bio_merge(q, bio))
- return BLK_QC_T_NONE;
-
- wb_acct = wbt_wait(q->rq_wb, bio, NULL);
-
- trace_block_getrq(q, bio, bio->bi_opf);
-
- rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
- if (unlikely(!rq)) {
- __wbt_done(q->rq_wb, wb_acct);
- return BLK_QC_T_NONE;
- }
-
- wbt_track(&rq->issue_stat, wb_acct);
-
- cookie = request_to_qc_t(data.hctx, rq);
-
- if (unlikely(is_flush_fua)) {
- if (q->elevator)
- goto elv_insert;
- blk_mq_bio_to_request(rq, bio);
- blk_insert_flush(rq);
- goto run_queue;
- }
-
- /*
- * A task plug currently exists. Since this is completely lockless,
- * utilize that to temporarily store requests until the task is
- * either done or scheduled away.
- */
- plug = current->plug;
- if (plug) {
+ } else if (plug && q->nr_hw_queues == 1) {
struct request *last = NULL;
+ blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
/*
@@ -1694,13 +1589,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
*/
if (list_empty(&plug->mq_list))
request_count = 0;
+ else if (blk_queue_nomerges(q))
+ request_count = blk_plug_queued_count(q);
+
if (!request_count)
trace_block_plug(q);
else
last = list_entry_rq(plug->mq_list.prev);
- blk_mq_put_ctx(data.ctx);
-
if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
blk_flush_plug_list(plug, false);
@@ -1708,30 +1604,41 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
}
list_add_tail(&rq->queuelist, &plug->mq_list);
- return cookie;
- }
-
- if (q->elevator) {
-elv_insert:
- blk_mq_put_ctx(data.ctx);
+ } else if (plug && !blk_queue_nomerges(q)) {
blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true,
- !is_sync || is_flush_fua, true);
- goto done;
- }
- if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+
/*
- * For a SYNC request, send it to the hardware immediately. For
- * an ASYNC request, just ensure that we run it later on. The
- * latter allows for merging opportunities and more efficient
- * dispatching.
+ * We do limited plugging. If the bio can be merged, do that.
+ * Otherwise the existing request in the plug list will be
+ * issued. So the plug list will have one request at most.
+ * The plug list might get flushed before this. If that happens,
+ * the plug list is empty, and same_queue_rq is invalid.
*/
-run_queue:
- blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
- }
+ if (list_empty(&plug->mq_list))
+ same_queue_rq = NULL;
+ if (same_queue_rq)
+ list_del_init(&same_queue_rq->queuelist);
+ list_add_tail(&rq->queuelist, &plug->mq_list);
+
+ blk_mq_put_ctx(data.ctx);
+
+ if (same_queue_rq)
+ blk_mq_try_issue_directly(data.hctx, same_queue_rq,
+ &cookie);
+ } else if (q->nr_hw_queues > 1 && is_sync) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+ } else if (q->elevator) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_bio_to_request(rq, bio);
+ blk_mq_sched_insert_request(rq, false, true, true, true);
+ } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
+ blk_mq_put_ctx(data.ctx);
+ blk_mq_run_hw_queue(data.hctx, true);
+ } else
+ blk_mq_put_ctx(data.ctx);
- blk_mq_put_ctx(data.ctx);
-done:
return cookie;
}
@@ -1988,9 +1895,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (node == NUMA_NO_NODE)
node = hctx->numa_node = set->numa_node;
- INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
- INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn);
- INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
+ INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q;
@@ -2067,8 +1972,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
spin_lock_init(&__ctx->lock);
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
- blk_stat_init(&__ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&__ctx->stat[BLK_STAT_WRITE]);
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
@@ -2215,6 +2118,8 @@ static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
{
struct request_queue *q;
+ lockdep_assert_held(&set->tag_list_lock);
+
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_freeze_queue(q);
queue_set_hctx_shared(q, shared);
@@ -2227,7 +2132,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&set->tag_list_lock);
- list_del_init(&q->tag_set_list);
+ list_del_rcu(&q->tag_set_list);
+ INIT_LIST_HEAD(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2235,6 +2141,8 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
blk_mq_update_tag_set_depth(set, false);
}
mutex_unlock(&set->tag_list_lock);
+
+ synchronize_rcu();
}
static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
@@ -2252,7 +2160,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
}
if (set->flags & BLK_MQ_F_TAG_SHARED)
queue_set_hctx_shared(q, true);
- list_add_tail(&q->tag_set_list, &set->tag_list);
+ list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
@@ -2364,6 +2272,12 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;
+ q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
+ blk_mq_poll_stats_bkt,
+ BLK_MQ_POLL_STATS_BKTS, q);
+ if (!q->poll_cb)
+ goto err_exit;
+
q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!q->queue_ctx)
goto err_exit;
@@ -2398,10 +2312,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
- if (q->nr_hw_queues > 1)
- blk_queue_make_request(q, blk_mq_make_request);
- else
- blk_queue_make_request(q, blk_sq_make_request);
+ blk_queue_make_request(q, blk_mq_make_request);
/*
* Do this after blk_queue_make_request() overrides it...
@@ -2456,8 +2367,6 @@ void blk_mq_free_queue(struct request_queue *q)
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);
- wbt_exit(q);
-
blk_mq_del_queue_tag_set(q);
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
@@ -2502,7 +2411,7 @@ static void blk_mq_queue_reinit_work(void)
* take place in parallel.
*/
list_for_each_entry(q, &all_q_list, all_q_node)
- blk_mq_freeze_queue_start(q);
+ blk_freeze_queue_start(q);
list_for_each_entry(q, &all_q_list, all_q_node)
blk_mq_freeze_queue_wait(q);
@@ -2743,6 +2652,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
struct request_queue *q;
+ lockdep_assert_held(&set->tag_list_lock);
+
if (nr_hw_queues > nr_cpu_ids)
nr_hw_queues = nr_cpu_ids;
if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
@@ -2755,16 +2666,6 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
-
- /*
- * Manually set the make_request_fn as blk_queue_make_request
- * resets a lot of the queue settings.
- */
- if (q->nr_hw_queues > 1)
- q->make_request_fn = blk_mq_make_request;
- else
- q->make_request_fn = blk_sq_make_request;
-
blk_mq_queue_reinit(q, cpu_online_mask);
}
@@ -2773,39 +2674,69 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
+/* Enable polling stats and return whether they were already enabled. */
+static bool blk_poll_stats_enable(struct request_queue *q)
+{
+ if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+ return true;
+ blk_stat_add_callback(q, q->poll_cb);
+ return false;
+}
+
+static void blk_mq_poll_stats_start(struct request_queue *q)
+{
+ /*
+ * We don't arm the callback if polling stats are not enabled or the
+ * callback is already active.
+ */
+ if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
+ blk_stat_is_active(q->poll_cb))
+ return;
+
+ blk_stat_activate_msecs(q->poll_cb, 100);
+}
+
+static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
+{
+ struct request_queue *q = cb->data;
+ int bucket;
+
+ for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
+ if (cb->stat[bucket].nr_samples)
+ q->poll_stat[bucket] = cb->stat[bucket];
+ }
+}
+
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct blk_rq_stat stat[2];
unsigned long ret = 0;
+ int bucket;
/*
* If stats collection isn't on, don't sleep but turn it on for
* future users
*/
- if (!blk_stat_enable(q))
+ if (!blk_poll_stats_enable(q))
return 0;
/*
- * We don't have to do this once per IO, should optimize this
- * to just use the current window of stats until it changes
- */
- memset(&stat, 0, sizeof(stat));
- blk_hctx_stat_get(hctx, stat);
-
- /*
* As an optimistic guess, use half of the mean service time
* for this type of request. We can (and should) make this smarter.
* For instance, if the completion latencies are tight, we can
* get closer than just half the mean. This is especially
* important on devices where the completion latencies are longer
- * than ~10 usec.
+ * than ~10 usec. We do use the stats for the relevant IO size
+ * if available which does lead to better estimates.
*/
- if (req_op(rq) == REQ_OP_READ && stat[BLK_STAT_READ].nr_samples)
- ret = (stat[BLK_STAT_READ].mean + 1) / 2;
- else if (req_op(rq) == REQ_OP_WRITE && stat[BLK_STAT_WRITE].nr_samples)
- ret = (stat[BLK_STAT_WRITE].mean + 1) / 2;
+ bucket = blk_mq_poll_stats_bkt(rq);
+ if (bucket < 0)
+ return ret;
+
+ if (q->poll_stat[bucket].nr_samples)
+ ret = (q->poll_stat[bucket].mean + 1) / 2;
return ret;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 660a17e1d0331..2814a14e529cd 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -20,7 +20,6 @@ struct blk_mq_ctx {
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
- struct blk_rq_stat stat[2];
struct request_queue *queue;
struct kobject kobj;
@@ -79,6 +78,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
*/
extern void blk_mq_sysfs_init(struct request_queue *q);
extern void blk_mq_sysfs_deinit(struct request_queue *q);
+extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
@@ -87,13 +87,12 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
* debugfs helpers
*/
#ifdef CONFIG_BLK_DEBUG_FS
-int blk_mq_debugfs_register(struct request_queue *q, const char *name);
+int blk_mq_debugfs_register(struct request_queue *q);
void blk_mq_debugfs_unregister(struct request_queue *q);
-int blk_mq_debugfs_register_hctxs(struct request_queue *q);
-void blk_mq_debugfs_unregister_hctxs(struct request_queue *q);
+int blk_mq_debugfs_register_mq(struct request_queue *q);
+void blk_mq_debugfs_unregister_mq(struct request_queue *q);
#else
-static inline int blk_mq_debugfs_register(struct request_queue *q,
- const char *name)
+static inline int blk_mq_debugfs_register(struct request_queue *q)
{
return 0;
}
@@ -102,12 +101,12 @@ static inline void blk_mq_debugfs_unregister(struct request_queue *q)
{
}
-static inline int blk_mq_debugfs_register_hctxs(struct request_queue *q)
+static inline int blk_mq_debugfs_register_mq(struct request_queue *q)
{
return 0;
}
-static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
+static inline void blk_mq_debugfs_unregister_mq(struct request_queue *q)
{
}
#endif
@@ -142,6 +141,7 @@ struct blk_mq_alloc_data {
/* input parameter */
struct request_queue *q;
unsigned int flags;
+ unsigned int shallow_depth;
/* input & output parameter */
struct blk_mq_ctx *ctx;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1e7174ffc9d49..4fa81ed383cab 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -103,7 +103,6 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->discard_granularity = 0;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
- lim->discard_zeroes_data = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
lim->alignment_offset = 0;
@@ -127,7 +126,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
blk_set_default_limits(lim);
/* Inherit limits from component devices */
- lim->discard_zeroes_data = 1;
lim->max_segments = USHRT_MAX;
lim->max_discard_segments = 1;
lim->max_hw_sectors = UINT_MAX;
@@ -609,7 +607,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
t->cluster &= b->cluster;
- t->discard_zeroes_data &= b->discard_zeroes_data;
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 186fcb981e9b1..6c2f40940439c 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -4,10 +4,27 @@
* Copyright (C) 2016 Jens Axboe
*/
#include <linux/kernel.h>
+#include <linux/rculist.h>
#include <linux/blk-mq.h>
#include "blk-stat.h"
#include "blk-mq.h"
+#include "blk.h"
+
+#define BLK_RQ_STAT_BATCH 64
+
+struct blk_queue_stats {
+ struct list_head callbacks;
+ spinlock_t lock;
+ bool enable_accounting;
+};
+
+static void blk_stat_init(struct blk_rq_stat *stat)
+{
+ stat->min = -1ULL;
+ stat->max = stat->nr_samples = stat->mean = 0;
+ stat->batch = stat->nr_batch = 0;
+}
static void blk_stat_flush_batch(struct blk_rq_stat *stat)
{
@@ -48,209 +65,185 @@ static void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
dst->nr_samples += src->nr_samples;
}
-static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+static void __blk_stat_add(struct blk_rq_stat *stat, u64 value)
{
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- uint64_t latest = 0;
- int i, j, nr;
-
- blk_stat_init(&dst[BLK_STAT_READ]);
- blk_stat_init(&dst[BLK_STAT_WRITE]);
-
- nr = 0;
- do {
- uint64_t newest = 0;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
-
- if (!ctx->stat[BLK_STAT_READ].nr_samples &&
- !ctx->stat[BLK_STAT_WRITE].nr_samples)
- continue;
- if (ctx->stat[BLK_STAT_READ].time > newest)
- newest = ctx->stat[BLK_STAT_READ].time;
- if (ctx->stat[BLK_STAT_WRITE].time > newest)
- newest = ctx->stat[BLK_STAT_WRITE].time;
- }
- }
+ stat->min = min(stat->min, value);
+ stat->max = max(stat->max, value);
- /*
- * No samples
- */
- if (!newest)
- break;
-
- if (newest > latest)
- latest = newest;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- if (ctx->stat[BLK_STAT_READ].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_READ],
- &ctx->stat[BLK_STAT_READ]);
- nr++;
- }
- if (ctx->stat[BLK_STAT_WRITE].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_WRITE],
- &ctx->stat[BLK_STAT_WRITE]);
- nr++;
- }
- }
- }
- /*
- * If we race on finding an entry, just loop back again.
- * Should be very rare.
- */
- } while (!nr);
+ if (stat->batch + value < stat->batch ||
+ stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+ blk_stat_flush_batch(stat);
- dst[BLK_STAT_READ].time = dst[BLK_STAT_WRITE].time = latest;
+ stat->batch += value;
+ stat->nr_batch++;
}
-void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+void blk_stat_add(struct request *rq)
{
- if (q->mq_ops)
- blk_mq_stat_get(q, dst);
- else {
- blk_stat_flush_batch(&q->rq_stats[BLK_STAT_READ]);
- blk_stat_flush_batch(&q->rq_stats[BLK_STAT_WRITE]);
- memcpy(&dst[BLK_STAT_READ], &q->rq_stats[BLK_STAT_READ],
- sizeof(struct blk_rq_stat));
- memcpy(&dst[BLK_STAT_WRITE], &q->rq_stats[BLK_STAT_WRITE],
- sizeof(struct blk_rq_stat));
+ struct request_queue *q = rq->q;
+ struct blk_stat_callback *cb;
+ struct blk_rq_stat *stat;
+ int bucket;
+ s64 now, value;
+
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ if (now < blk_stat_time(&rq->issue_stat))
+ return;
+
+ value = now - blk_stat_time(&rq->issue_stat);
+
+ blk_throtl_stat_add(rq, value);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
+ if (blk_stat_is_active(cb)) {
+ bucket = cb->bucket_fn(rq);
+ if (bucket < 0)
+ continue;
+ stat = &this_cpu_ptr(cb->cpu_stat)[bucket];
+ __blk_stat_add(stat, value);
+ }
}
+ rcu_read_unlock();
}
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+static void blk_stat_timer_fn(unsigned long data)
{
- struct blk_mq_ctx *ctx;
- unsigned int i, nr;
+ struct blk_stat_callback *cb = (void *)data;
+ unsigned int bucket;
+ int cpu;
- nr = 0;
- do {
- uint64_t newest = 0;
+ for (bucket = 0; bucket < cb->buckets; bucket++)
+ blk_stat_init(&cb->stat[bucket]);
- hctx_for_each_ctx(hctx, ctx, i) {
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_READ]);
- blk_stat_flush_batch(&ctx->stat[BLK_STAT_WRITE]);
+ for_each_online_cpu(cpu) {
+ struct blk_rq_stat *cpu_stat;
- if (!ctx->stat[BLK_STAT_READ].nr_samples &&
- !ctx->stat[BLK_STAT_WRITE].nr_samples)
- continue;
-
- if (ctx->stat[BLK_STAT_READ].time > newest)
- newest = ctx->stat[BLK_STAT_READ].time;
- if (ctx->stat[BLK_STAT_WRITE].time > newest)
- newest = ctx->stat[BLK_STAT_WRITE].time;
+ cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+ for (bucket = 0; bucket < cb->buckets; bucket++) {
+ blk_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]);
+ blk_stat_init(&cpu_stat[bucket]);
}
+ }
- if (!newest)
- break;
-
- hctx_for_each_ctx(hctx, ctx, i) {
- if (ctx->stat[BLK_STAT_READ].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_READ],
- &ctx->stat[BLK_STAT_READ]);
- nr++;
- }
- if (ctx->stat[BLK_STAT_WRITE].time == newest) {
- blk_stat_sum(&dst[BLK_STAT_WRITE],
- &ctx->stat[BLK_STAT_WRITE]);
- nr++;
- }
- }
- /*
- * If we race on finding an entry, just loop back again.
- * Should be very rare, as the window is only updated
- * occasionally
- */
- } while (!nr);
+ cb->timer_fn(cb);
}
-static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+ int (*bucket_fn)(const struct request *),
+ unsigned int buckets, void *data)
{
- stat->min = -1ULL;
- stat->max = stat->nr_samples = stat->mean = 0;
- stat->batch = stat->nr_batch = 0;
- stat->time = time_now & BLK_STAT_NSEC_MASK;
-}
+ struct blk_stat_callback *cb;
-void blk_stat_init(struct blk_rq_stat *stat)
-{
- __blk_stat_init(stat, ktime_to_ns(ktime_get()));
-}
+ cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+ if (!cb)
+ return NULL;
-static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
-{
- return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+ cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat),
+ GFP_KERNEL);
+ if (!cb->stat) {
+ kfree(cb);
+ return NULL;
+ }
+ cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat),
+ __alignof__(struct blk_rq_stat));
+ if (!cb->cpu_stat) {
+ kfree(cb->stat);
+ kfree(cb);
+ return NULL;
+ }
+
+ cb->timer_fn = timer_fn;
+ cb->bucket_fn = bucket_fn;
+ cb->data = data;
+ cb->buckets = buckets;
+ setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+
+ return cb;
}
+EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
-bool blk_stat_is_current(struct blk_rq_stat *stat)
+void blk_stat_add_callback(struct request_queue *q,
+ struct blk_stat_callback *cb)
{
- return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+ unsigned int bucket;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct blk_rq_stat *cpu_stat;
+
+ cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu);
+ for (bucket = 0; bucket < cb->buckets; bucket++)
+ blk_stat_init(&cpu_stat[bucket]);
+ }
+
+ spin_lock(&q->stats->lock);
+ list_add_tail_rcu(&cb->list, &q->stats->callbacks);
+ set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
}
+EXPORT_SYMBOL_GPL(blk_stat_add_callback);
-void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+void blk_stat_remove_callback(struct request_queue *q,
+ struct blk_stat_callback *cb)
{
- s64 now, value;
+ spin_lock(&q->stats->lock);
+ list_del_rcu(&cb->list);
+ if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
+ clear_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
- now = __blk_stat_time(ktime_to_ns(ktime_get()));
- if (now < blk_stat_time(&rq->issue_stat))
- return;
-
- if (!__blk_stat_is_current(stat, now))
- __blk_stat_init(stat, now);
+ del_timer_sync(&cb->timer);
+}
+EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
- value = now - blk_stat_time(&rq->issue_stat);
- if (value > stat->max)
- stat->max = value;
- if (value < stat->min)
- stat->min = value;
+static void blk_stat_free_callback_rcu(struct rcu_head *head)
+{
+ struct blk_stat_callback *cb;
- if (stat->batch + value < stat->batch ||
- stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
- blk_stat_flush_batch(stat);
+ cb = container_of(head, struct blk_stat_callback, rcu);
+ free_percpu(cb->cpu_stat);
+ kfree(cb->stat);
+ kfree(cb);
+}
- stat->batch += value;
- stat->nr_batch++;
+void blk_stat_free_callback(struct blk_stat_callback *cb)
+{
+ if (cb)
+ call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
}
+EXPORT_SYMBOL_GPL(blk_stat_free_callback);
-void blk_stat_clear(struct request_queue *q)
+void blk_stat_enable_accounting(struct request_queue *q)
{
- if (q->mq_ops) {
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- int i, j;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx_for_each_ctx(hctx, ctx, j) {
- blk_stat_init(&ctx->stat[BLK_STAT_READ]);
- blk_stat_init(&ctx->stat[BLK_STAT_WRITE]);
- }
- }
- } else {
- blk_stat_init(&q->rq_stats[BLK_STAT_READ]);
- blk_stat_init(&q->rq_stats[BLK_STAT_WRITE]);
- }
+ spin_lock(&q->stats->lock);
+ q->stats->enable_accounting = true;
+ set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
+ spin_unlock(&q->stats->lock);
}
-void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+struct blk_queue_stats *blk_alloc_queue_stats(void)
{
- stat->time = (stat->time & BLK_STAT_MASK) |
- (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+ struct blk_queue_stats *stats;
+
+ stats = kmalloc(sizeof(*stats), GFP_KERNEL);
+ if (!stats)
+ return NULL;
+
+ INIT_LIST_HEAD(&stats->callbacks);
+ spin_lock_init(&stats->lock);
+ stats->enable_accounting = false;
+
+ return stats;
}
-/*
- * Enable stat tracking, return whether it was enabled
- */
-bool blk_stat_enable(struct request_queue *q)
+void blk_free_queue_stats(struct blk_queue_stats *stats)
{
- if (!test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
- set_bit(QUEUE_FLAG_STATS, &q->queue_flags);
- return false;
- }
+ if (!stats)
+ return;
+
+ WARN_ON(!list_empty(&stats->callbacks));
- return true;
+ kfree(stats);
}
diff --git a/block/blk-stat.h b/block/blk-stat.h
index a2050a0a5314b..2fb20d1a341a8 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -1,33 +1,85 @@
#ifndef BLK_STAT_H
#define BLK_STAT_H
-/*
- * ~0.13s window as a power-of-2 (2^27 nsecs)
- */
-#define BLK_STAT_NSEC 134217728ULL
-#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1)
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/ktime.h>
+#include <linux/rcupdate.h>
+#include <linux/timer.h>
/*
- * Upper 3 bits can be used elsewhere
+ * from upper:
+ * 3 bits: reserved for other usage
+ * 12 bits: size
+ * 49 bits: time
*/
#define BLK_STAT_RES_BITS 3
-#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS)
-#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1)
-#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK
+#define BLK_STAT_SIZE_BITS 12
+#define BLK_STAT_RES_SHIFT (64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_SIZE_SHIFT (BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
+#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SIZE_SHIFT) - 1)
+#define BLK_STAT_SIZE_MASK \
+ (((1ULL << BLK_STAT_SIZE_BITS) - 1) << BLK_STAT_SIZE_SHIFT)
+#define BLK_STAT_RES_MASK (~((1ULL << BLK_STAT_RES_SHIFT) - 1))
+
+/**
+ * struct blk_stat_callback - Block statistics callback.
+ *
+ * A &struct blk_stat_callback is associated with a &struct request_queue. While
+ * @timer is active, that queue's request completion latencies are sorted into
+ * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the
+ * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked.
+ */
+struct blk_stat_callback {
+ /**
+ * @list: RCU list of callbacks for a &struct request_queue.
+ */
+ struct list_head list;
+
+ /**
+ * @timer: Timer for the next callback invocation.
+ */
+ struct timer_list timer;
+
+ /**
+ * @cpu_stat: Per-cpu statistics buckets.
+ */
+ struct blk_rq_stat __percpu *cpu_stat;
+
+ /**
+ * @bucket_fn: Given a request, returns which statistics bucket it
+ * should be accounted under. Return -1 for no bucket for this
+ * request.
+ */
+ int (*bucket_fn)(const struct request *);
+
+ /**
+ * @buckets: Number of statistics buckets.
+ */
+ unsigned int buckets;
+
+ /**
+ * @stat: Array of statistics buckets.
+ */
+ struct blk_rq_stat *stat;
+
+ /**
+ * @timer_fn: Callback function, invoked when @timer fires.
+ */
+ void (*timer_fn)(struct blk_stat_callback *);
+
+ /**
+ * @data: Private pointer for the user.
+ */
+ void *data;
-enum {
- BLK_STAT_READ = 0,
- BLK_STAT_WRITE,
+ struct rcu_head rcu; /* @rcu: RCU head passed to call_rcu() by blk_stat_free_callback() */
};
-void blk_stat_add(struct blk_rq_stat *, struct request *);
-void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
-void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
-void blk_stat_clear(struct request_queue *);
-void blk_stat_init(struct blk_rq_stat *);
-bool blk_stat_is_current(struct blk_rq_stat *);
-void blk_stat_set_issue_time(struct blk_issue_stat *);
-bool blk_stat_enable(struct request_queue *);
+struct blk_queue_stats *blk_alloc_queue_stats(void);
+void blk_free_queue_stats(struct blk_queue_stats *);
+
+void blk_stat_add(struct request *);
static inline u64 __blk_stat_time(u64 time)
{
@@ -36,7 +88,117 @@ static inline u64 __blk_stat_time(u64 time)
static inline u64 blk_stat_time(struct blk_issue_stat *stat)
{
- return __blk_stat_time(stat->time);
+ return __blk_stat_time(stat->stat);
+}
+
+static inline sector_t blk_capped_size(sector_t size) /* reduce @size to what fits in the BLK_STAT_SIZE_BITS-wide size field */
+{
+ return size & ((1ULL << BLK_STAT_SIZE_BITS) - 1); /* keeps only the low bits: larger sizes wrap around rather than saturate */
+}
+
+static inline sector_t blk_stat_size(struct blk_issue_stat *stat) /* read back the size field packed into @stat->stat */
+{
+ return (stat->stat & BLK_STAT_SIZE_MASK) >> BLK_STAT_SIZE_SHIFT; /* isolate the size bits, then shift them down to bit 0 */
+}
+
+static inline void blk_stat_set_issue(struct blk_issue_stat *stat, /* pack the current time and @size into @stat->stat */
+ sector_t size)
+{
+ stat->stat = (stat->stat & BLK_STAT_RES_MASK) | /* preserve the reserved upper bits */
+ (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK) | /* current time in ns, truncated to the time field */
+ (((u64)blk_capped_size(size)) << BLK_STAT_SIZE_SHIFT); /* size, masked to the size field width */
+}
+
+/* Record time/size info in requests without adding a callback. */
+void blk_stat_enable_accounting(struct request_queue *q);
+
+/**
+ * blk_stat_alloc_callback() - Allocate a block statistics callback.
+ * @timer_fn: Timer callback function.
+ * @bucket_fn: Bucket callback function.
+ * @buckets: Number of statistics buckets.
+ * @data: Value for the @data field of the &struct blk_stat_callback.
+ *
+ * See &struct blk_stat_callback for details on the callback functions.
+ *
+ * Return: &struct blk_stat_callback on success or NULL on ENOMEM.
+ */
+struct blk_stat_callback *
+blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
+ int (*bucket_fn)(const struct request *),
+ unsigned int buckets, void *data);
+
+/**
+ * blk_stat_add_callback() - Add a block statistics callback to be run on a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * Note that a single &struct blk_stat_callback can only be added to a single
+ * &struct request_queue.
+ */
+void blk_stat_add_callback(struct request_queue *q,
+ struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_remove_callback() - Remove a block statistics callback from a
+ * request queue.
+ * @q: The request queue.
+ * @cb: The callback.
+ *
+ * When this returns, the callback is not running on any CPU and will not be
+ * called again unless it is re-added.
+ */
+void blk_stat_remove_callback(struct request_queue *q,
+ struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_free_callback() - Free a block statistics callback.
+ * @cb: The callback.
+ *
+ * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must
+ * not be associated with a request queue. I.e., if it was previously added with
+ * blk_stat_add_callback(), it must also have been removed since then with
+ * blk_stat_remove_callback().
+ */
+void blk_stat_free_callback(struct blk_stat_callback *cb);
+
+/**
+ * blk_stat_is_active() - Check if a block statistics callback is currently
+ * gathering statistics.
+ * @cb: The callback.
+ *
+ * Return: true while @cb->timer is pending, false otherwise.
+ */
+static inline bool blk_stat_is_active(struct blk_stat_callback *cb)
+{
+ return timer_pending(&cb->timer);
+}
+
+/**
+ * blk_stat_activate_nsecs() - Gather block statistics during a time window in
+ * nanoseconds.
+ * @cb: The callback.
+ * @nsecs: Number of nanoseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires. The window is
+ * converted to jiffies with nsecs_to_jiffies(), so its effective length has
+ * jiffy granularity. The timer is armed with mod_timer(), so calling this
+ * again moves the expiry of @cb->timer to the new window.
+ */
+static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
+ u64 nsecs)
+{
+ mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
+}
+
+/**
+ * blk_stat_activate_msecs() - Gather block statistics during a time window in
+ * milliseconds.
+ * @cb: The callback.
+ * @msecs: Number of milliseconds to gather statistics for.
+ *
+ * The timer callback will be called when the window expires. The window is
+ * converted to jiffies with msecs_to_jiffies(), so its effective length has
+ * jiffy granularity. The timer is armed with mod_timer(), so calling this
+ * again moves the expiry of @cb->timer to the new window.
+ */
+static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb,
+ unsigned int msecs)
+{
+ mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs));
}
#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 37f0b3ad635ea..3f37813ccbafd 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -208,7 +208,7 @@ static ssize_t queue_discard_max_store(struct request_queue *q,
static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_discard_zeroes_data(q), page);
+ return queue_var_show(0, page);
}
static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
@@ -503,26 +503,6 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_dax(q), page);
}
-static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
-{
- return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
- pre, (long long) stat->nr_samples,
- (long long) stat->mean, (long long) stat->min,
- (long long) stat->max);
-}
-
-static ssize_t queue_stats_show(struct request_queue *q, char *page)
-{
- struct blk_rq_stat stat[2];
- ssize_t ret;
-
- blk_queue_stat_get(q, stat);
-
- ret = print_stat(page, &stat[BLK_STAT_READ], "read :");
- ret += print_stat(page + ret, &stat[BLK_STAT_WRITE], "write:");
- return ret;
-}
-
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -691,17 +671,20 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
};
-static struct queue_sysfs_entry queue_stats_entry = {
- .attr = {.name = "stats", .mode = S_IRUGO },
- .show = queue_stats_show,
-};
-
static struct queue_sysfs_entry queue_wb_lat_entry = {
.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
.show = queue_wb_lat_show,
.store = queue_wb_lat_store,
};
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static struct queue_sysfs_entry throtl_sample_time_entry = {
+ .attr = {.name = "throttle_sample_time", .mode = S_IRUGO | S_IWUSR },
+ .show = blk_throtl_sample_time_show,
+ .store = blk_throtl_sample_time_store,
+};
+#endif
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -733,9 +716,11 @@ static struct attribute *default_attrs[] = {
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_dax_entry.attr,
- &queue_stats_entry.attr,
&queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ &throtl_sample_time_entry.attr,
+#endif
NULL,
};
@@ -810,7 +795,9 @@ static void blk_release_queue(struct kobject *kobj)
struct request_queue *q =
container_of(kobj, struct request_queue, kobj);
- wbt_exit(q);
+ if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
+ blk_stat_remove_callback(q, q->poll_cb);
+ blk_stat_free_callback(q->poll_cb);
bdi_put(q->backing_dev_info);
blkcg_exit_queue(q);
@@ -819,6 +806,8 @@ static void blk_release_queue(struct kobject *kobj)
elevator_exit(q, q->elevator);
}
+ blk_free_queue_stats(q->stats);
+
blk_exit_rl(&q->root_rl);
if (q->queue_tags)
@@ -855,23 +844,6 @@ struct kobj_type blk_queue_ktype = {
.release = blk_release_queue,
};
-static void blk_wb_init(struct request_queue *q)
-{
-#ifndef CONFIG_BLK_WBT_MQ
- if (q->mq_ops)
- return;
-#endif
-#ifndef CONFIG_BLK_WBT_SQ
- if (q->request_fn)
- return;
-#endif
-
- /*
- * If this fails, we don't get throttling
- */
- wbt_init(q);
-}
-
int blk_register_queue(struct gendisk *disk)
{
int ret;
@@ -881,6 +853,11 @@ int blk_register_queue(struct gendisk *disk)
if (WARN_ON(!q))
return -ENXIO;
+ WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+ "%s is registering an already registered queue\n",
+ kobject_name(&dev->kobj));
+ queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
+
/*
* SCSI probing may synchronously create and destroy a lot of
* request_queues for non-existent devices. Shutting down a fully
@@ -900,9 +877,6 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
return ret;
- if (q->mq_ops)
- blk_mq_register_dev(dev, q);
-
/* Prevent changes through sysfs until registration is completed. */
mutex_lock(&q->sysfs_lock);
@@ -912,9 +886,14 @@ int blk_register_queue(struct gendisk *disk)
goto unlock;
}
+ if (q->mq_ops)
+ __blk_mq_register_dev(dev, q);
+
kobject_uevent(&q->kobj, KOBJ_ADD);
- blk_wb_init(q);
+ wbt_enable_default(q);
+
+ blk_throtl_register_queue(q);
if (q->request_fn || (q->mq_ops && q->elevator)) {
ret = elv_register_queue(q);
@@ -939,6 +918,11 @@ void blk_unregister_queue(struct gendisk *disk)
if (WARN_ON(!q))
return;
+ queue_flag_clear_unlocked(QUEUE_FLAG_REGISTERED, q);
+
+ wbt_exit(q);
+
+
if (q->mq_ops)
blk_mq_unregister_dev(disk_to_dev(disk), q);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8fab716e40596..b78db2e5fdff1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -18,8 +18,17 @@ static int throtl_grp_quantum = 8;
/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;
-/* Throttling is performed over 100ms slice and after that slice is renewed */
-static unsigned long throtl_slice = HZ/10; /* 100 ms */
+/* Throttling is performed over a slice and after that slice is renewed */
+#define DFL_THROTL_SLICE_HD (HZ / 10)
+#define DFL_THROTL_SLICE_SSD (HZ / 50)
+#define MAX_THROTL_SLICE (HZ)
+#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
+#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
+#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
+/* default latency target is 0, i.e., guarantee IO latency by default */
+#define DFL_LATENCY_TARGET (0)
+
+#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
static struct blkcg_policy blkcg_policy_throtl;
@@ -83,6 +92,12 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
+enum { /* indices into the per-level limit arrays (bps, iops, limit_valid, ...) */
+ LIMIT_LOW, /* the .low limit level */
+ LIMIT_MAX, /* the .max limit level */
+ LIMIT_CNT, /* number of limit levels; used as the array dimension */
+};
+
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
@@ -119,20 +134,54 @@ struct throtl_grp {
/* are there any throtl rules between this group and td? */
bool has_rules[2];
- /* bytes per second rate limits */
- uint64_t bps[2];
+ /* internally used bytes per second rate limits */
+ uint64_t bps[2][LIMIT_CNT];
+ /* user configured bps limits */
+ uint64_t bps_conf[2][LIMIT_CNT];
- /* IOPS limits */
- unsigned int iops[2];
+ /* internally used IOPS limits */
+ unsigned int iops[2][LIMIT_CNT];
+ /* user configured IOPS limits */
+ unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
+ unsigned long last_low_overflow_time[2];
+
+ uint64_t last_bytes_disp[2];
+ unsigned int last_io_disp[2];
+
+ unsigned long last_check_time;
+
+ unsigned long latency_target; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
+
+ unsigned long last_finish_time; /* ns / 1024 */
+ unsigned long checked_last_finish_time; /* ns / 1024 */
+ unsigned long avg_idletime; /* ns / 1024 */
+ unsigned long idletime_threshold; /* us */
+
+ unsigned int bio_cnt; /* total bios */
+ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+ unsigned long bio_cnt_reset_time;
+};
+
+/* We measure latency for request size from <= 4k to >= 1M */
+#define LATENCY_BUCKET_SIZE 9
+
+struct latency_bucket {
+ unsigned long total_latency; /* ns / 1024 */
+ int samples;
+};
+
+struct avg_latency_bucket {
+ unsigned long latency; /* ns / 1024 */
+ bool valid;
};
struct throtl_data
@@ -145,8 +194,26 @@ struct throtl_data
/* Total Number of queued bios on READ and WRITE lists */
unsigned int nr_queued[2];
+ unsigned int throtl_slice;
+
/* Work for dispatching throttled bios */
struct work_struct dispatch_work;
+ unsigned int limit_index;
+ bool limit_valid[LIMIT_CNT];
+
+ unsigned long dft_idletime_threshold; /* us */
+
+ unsigned long low_upgrade_time;
+ unsigned long low_downgrade_time;
+
+ unsigned int scale;
+
+ struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
+ struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
+ struct latency_bucket __percpu *latency_buckets;
+ unsigned long last_calculate_time;
+
+ bool track_bio_latency;
};
static void throtl_pending_timer_fn(unsigned long arg);
@@ -198,6 +265,76 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
return container_of(sq, struct throtl_data, service_queue);
}
+/*
+ * A cgroup's limit in LIMIT_MAX is scaled if a low limit is set. This scaling
+ * makes the IO dispatch smoother.
+ * Scale up: linearly scale up according to the time elapsed since upgrade. For
+ * every throtl_slice, the limit scales up by 1/2 of the .low limit until the
+ * limit hits the .max limit.
+ * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit.
+ *
+ * Returns @low plus half of @low for each unit of td->scale. As a side effect,
+ * td->scale is refreshed here: once at least scale * throtl_slice jiffies have
+ * passed since td->low_upgrade_time (and scale is still below 4096), it is set
+ * to the number of whole throtl_slice periods elapsed since that time. This
+ * function does not clamp the result against the .max limit itself.
+ */
+static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
+{
+ /* 4096 is an arbitrary bound that keeps the scale from growing too large */
+ if (td->scale < 4096 && time_after_eq(jiffies,
+ td->low_upgrade_time + td->scale * td->throtl_slice))
+ td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
+
+ return low + (low >> 1) * td->scale;
+}
+
+static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) /* bps limit currently in effect for @tg in direction @rw */
+{
+ struct blkcg_gq *blkg = tg_to_blkg(tg);
+ struct throtl_data *td;
+ uint64_t ret;
+
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) /* on the default hierarchy a blkg without a parent reports U64_MAX */
+ return U64_MAX;
+
+ td = tg->td;
+ ret = tg->bps[rw][td->limit_index]; /* start from the limit at the queue's active level */
+ if (ret == 0 && td->limit_index == LIMIT_LOW) /* no low limit configured: use the max limit instead */
+ return tg->bps[rw][LIMIT_MAX];
+
+ if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && /* at the max level with a distinct, non-zero low limit... */
+ tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
+ uint64_t adjusted;
+
+ adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); /* ...scale up from the low limit */
+ ret = min(tg->bps[rw][LIMIT_MAX], adjusted); /* but never beyond the max limit */
+ }
+ return ret;
+}
+
+static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) /* iops limit currently in effect for @tg in direction @rw */
+{
+ struct blkcg_gq *blkg = tg_to_blkg(tg);
+ struct throtl_data *td;
+ unsigned int ret;
+
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) /* on the default hierarchy a blkg without a parent reports UINT_MAX */
+ return UINT_MAX;
+ td = tg->td;
+ ret = tg->iops[rw][td->limit_index]; /* start from the limit at the queue's active level */
+ if (ret == 0 && tg->td->limit_index == LIMIT_LOW) /* no low limit configured: use the max limit instead */
+ return tg->iops[rw][LIMIT_MAX];
+
+ if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && /* at the max level with a distinct, non-zero low limit... */
+ tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
+ uint64_t adjusted;
+
+ adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); /* ...scale up from the low limit (64-bit result) */
+ if (adjusted > UINT_MAX) /* clamp before narrowing to unsigned int below */
+ adjusted = UINT_MAX;
+ ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); /* never beyond the max limit */
+ }
+ return ret;
+}
+
+#define request_bucket_index(sectors) \
+ clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) /* 8 sectors or fewer map to bucket 0; the index grows with order_base_2(sectors) and is clamped to the last bucket */
+
/**
* throtl_log - log debug message via blktrace
* @sq: the service_queue being reported
@@ -334,10 +471,17 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
}
RB_CLEAR_NODE(&tg->rb_node);
- tg->bps[READ] = -1;
- tg->bps[WRITE] = -1;
- tg->iops[READ] = -1;
- tg->iops[WRITE] = -1;
+ tg->bps[READ][LIMIT_MAX] = U64_MAX;
+ tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
+ tg->iops[READ][LIMIT_MAX] = UINT_MAX;
+ tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
+ tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
+ tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
+ tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
+ tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
+ /* LIMIT_LOW will have default value 0 */
+
+ tg->latency_target = DFL_LATENCY_TARGET;
return &tg->pd;
}
@@ -366,6 +510,8 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
tg->td = td;
+
+ tg->idletime_threshold = td->dft_idletime_threshold;
}
/*
@@ -376,20 +522,59 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
static void tg_update_has_rules(struct throtl_grp *tg) /* recompute tg->has_rules[] for READ and WRITE */
{
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
+ struct throtl_data *td = tg->td;
int rw;
for (rw = READ; rw <= WRITE; rw++)
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || /* rules are inherited from the parent group */
- (tg->bps[rw] != -1 || tg->iops[rw] != -1);
+ (td->limit_valid[td->limit_index] && /* otherwise the active limit level must be valid... */
+ (tg_bps_limit(tg, rw) != U64_MAX || /* ...and the effective bps limit */
+ tg_iops_limit(tg, rw) != UINT_MAX)); /* or iops limit must be below its maximum */
}
static void throtl_pd_online(struct blkg_policy_data *pd)
{
+ struct throtl_grp *tg = pd_to_tg(pd);
/*
* We don't want new groups to escape the limits of their ancestors.
* Update has_rules[] after a new group is brought online.
*/
- tg_update_has_rules(pd_to_tg(pd));
+ tg_update_has_rules(tg);
+}
+
+static void blk_throtl_update_limit_valid(struct throtl_data *td) /* recompute td->limit_valid[LIMIT_LOW] from the groups' low limits */
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+ bool low_valid = false;
+
+ rcu_read_lock(); /* the blkg walk below runs inside an RCU read-side section */
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || /* any non-zero low limit in any visited group counts */
+ tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ low_valid = true; /* NOTE(review): the walk continues after the first hit; an early break looks possible - confirm */
+ }
+ rcu_read_unlock();
+
+ td->limit_valid[LIMIT_LOW] = low_valid;
+}
+
+static void throtl_upgrade_state(struct throtl_data *td);
+static void throtl_pd_offline(struct blkg_policy_data *pd) /* drop the group's low limits and re-evaluate the queue's limit state */
+{
+ struct throtl_grp *tg = pd_to_tg(pd);
+
+ tg->bps[READ][LIMIT_LOW] = 0; /* zero all four low limits so this group no longer counts towards LIMIT_LOW validity */
+ tg->bps[WRITE][LIMIT_LOW] = 0;
+ tg->iops[READ][LIMIT_LOW] = 0;
+ tg->iops[WRITE][LIMIT_LOW] = 0;
+
+ blk_throtl_update_limit_valid(tg->td);
+
+ if (!tg->td->limit_valid[tg->td->limit_index]) /* the active limit level is no longer valid: upgrade */
+ throtl_upgrade_state(tg->td);
}
static void throtl_pd_free(struct blkg_policy_data *pd)
@@ -499,6 +684,17 @@ static void throtl_dequeue_tg(struct throtl_grp *tg)
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
unsigned long expires)
{
+ unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;
+
+ /*
+ * Since we are adjusting the throttle limit dynamically, the sleep
+ * time calculated according to the previous limit might be invalid. It's
+ * possible that the cgroup's sleep time is very long and no other cgroup
+ * has IO running to notify it of the limit change. Make sure the cgroup
+ * doesn't sleep too long, so that the notification is not missed.
+ */
+ if (time_after(expires, max_expire))
+ expires = max_expire;
mod_timer(&sq->pending_timer, expires);
throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
expires - jiffies, jiffies);
@@ -556,7 +752,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
if (time_after_eq(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
- tg->slice_end[rw] = jiffies + throtl_slice;
+ tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -568,7 +764,7 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
- tg->slice_end[rw] = jiffies + throtl_slice;
+ tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -578,13 +774,13 @@ static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
- tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+ tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
}
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
- tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
+ tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -624,19 +820,20 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
* is bad because it does not allow new slice to start.
*/
- throtl_set_slice_end(tg, rw, jiffies + throtl_slice);
+ throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
time_elapsed = jiffies - tg->slice_start[rw];
- nr_slices = time_elapsed / throtl_slice;
+ nr_slices = time_elapsed / tg->td->throtl_slice;
if (!nr_slices)
return;
- tmp = tg->bps[rw] * throtl_slice * nr_slices;
+ tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
do_div(tmp, HZ);
bytes_trim = tmp;
- io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
+ io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
+ HZ;
if (!bytes_trim && !io_trim)
return;
@@ -651,7 +848,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
else
tg->io_disp[rw] = 0;
- tg->slice_start[rw] += nr_slices * throtl_slice;
+ tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
throtl_log(&tg->service_queue,
"[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
@@ -671,9 +868,9 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
/* Slice has just started. Consider one slice interval */
if (!jiffy_elapsed)
- jiffy_elapsed_rnd = throtl_slice;
+ jiffy_elapsed_rnd = tg->td->throtl_slice;
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -682,7 +879,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
* have been trimmed.
*/
- tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
+ tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
@@ -697,7 +894,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
}
/* Calc approx time to dispatch */
- jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
+ jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;
if (jiffy_wait > jiffy_elapsed)
jiffy_wait = jiffy_wait - jiffy_elapsed;
@@ -720,11 +917,11 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Slice has just started. Consider one slice interval */
if (!jiffy_elapsed)
- jiffy_elapsed_rnd = throtl_slice;
+ jiffy_elapsed_rnd = tg->td->throtl_slice;
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
- tmp = tg->bps[rw] * jiffy_elapsed_rnd;
+ tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
do_div(tmp, HZ);
bytes_allowed = tmp;
@@ -736,7 +933,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
- jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
+ jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
if (!jiffy_wait)
jiffy_wait = 1;
@@ -771,7 +968,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
+ if (tg_bps_limit(tg, rw) == U64_MAX &&
+ tg_iops_limit(tg, rw) == UINT_MAX) {
if (wait)
*wait = 0;
return true;
@@ -787,8 +985,10 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
throtl_start_new_slice(tg, rw);
else {
- if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
- throtl_extend_slice(tg, rw, jiffies + throtl_slice);
+ if (time_before(tg->slice_end[rw],
+ jiffies + tg->td->throtl_slice))
+ throtl_extend_slice(tg, rw,
+ jiffies + tg->td->throtl_slice);
}
if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
@@ -816,6 +1016,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
/* Charge the bio to the group */
tg->bytes_disp[rw] += bio->bi_iter.bi_size;
tg->io_disp[rw]++;
+ tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+ tg->last_io_disp[rw]++;
/*
* BIO_THROTTLED is used to prevent the same bio to be throttled
@@ -999,6 +1201,8 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
return nr_disp;
}
+static bool throtl_can_upgrade(struct throtl_data *td,
+ struct throtl_grp *this_tg);
/**
* throtl_pending_timer_fn - timer function for service_queue->pending_timer
* @arg: the throtl_service_queue being serviced
@@ -1025,6 +1229,9 @@ static void throtl_pending_timer_fn(unsigned long arg)
int ret;
spin_lock_irq(q->queue_lock);
+ if (throtl_can_upgrade(td, NULL))
+ throtl_upgrade_state(td);
+
again:
parent_sq = sq->parent_sq;
dispatched = false;
@@ -1112,7 +1319,7 @@ static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
struct throtl_grp *tg = pd_to_tg(pd);
u64 v = *(u64 *)((void *)tg + off);
- if (v == -1)
+ if (v == U64_MAX)
return 0;
return __blkg_prfill_u64(sf, pd, v);
}
@@ -1123,7 +1330,7 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
struct throtl_grp *tg = pd_to_tg(pd);
unsigned int v = *(unsigned int *)((void *)tg + off);
- if (v == -1)
+ if (v == UINT_MAX)
return 0;
return __blkg_prfill_u64(sf, pd, v);
}
@@ -1150,8 +1357,8 @@ static void tg_conf_updated(struct throtl_grp *tg)
throtl_log(&tg->service_queue,
"limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
- tg->bps[READ], tg->bps[WRITE],
- tg->iops[READ], tg->iops[WRITE]);
+ tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
+ tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
/*
* Update has_rules[] flags for the updated tg's subtree. A tg is
@@ -1197,7 +1404,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
if (sscanf(ctx.body, "%llu", &v) != 1)
goto out_finish;
if (!v)
- v = -1;
+ v = U64_MAX;
tg = blkg_to_tg(ctx.blkg);
@@ -1228,25 +1435,25 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
- .private = offsetof(struct throtl_grp, bps[READ]),
+ .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.write_bps_device",
- .private = offsetof(struct throtl_grp, bps[WRITE]),
+ .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
.seq_show = tg_print_conf_u64,
.write = tg_set_conf_u64,
},
{
.name = "throttle.read_iops_device",
- .private = offsetof(struct throtl_grp, iops[READ]),
+ .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
{
.name = "throttle.write_iops_device",
- .private = offsetof(struct throtl_grp, iops[WRITE]),
+ .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
.seq_show = tg_print_conf_uint,
.write = tg_set_conf_uint,
},
@@ -1263,48 +1470,87 @@ static struct cftype throtl_legacy_files[] = {
{ } /* terminate */
};
-static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, /* print one device's configured limits for level @off (LIMIT_LOW or LIMIT_MAX) */
int off)
{
struct throtl_grp *tg = pd_to_tg(pd);
const char *dname = blkg_dev_name(pd->blkg);
char bufs[4][21] = { "max", "max", "max", "max" }; /* rbps, wbps, riops, wiops; entries left at their default print as "max" */
+ u64 bps_dft;
+ unsigned int iops_dft;
+ char idle_time[26] = ""; /* NOTE(review): " idle=" plus a 20-digit %lu needs 27 bytes with the NUL; snprintf() would truncate - confirm the size */
+ char latency_time[26] = ""; /* NOTE(review): " latency=" plus a 20-digit %lu needs 30 bytes with the NUL; snprintf() would truncate - confirm the size */
if (!dname)
return 0;
- if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
- tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+
+ if (off == LIMIT_LOW) { /* the "not configured" value depends on the level: 0 for low, the type maximum otherwise */
+ bps_dft = 0;
+ iops_dft = 0;
+ } else {
+ bps_dft = U64_MAX;
+ iops_dft = UINT_MAX;
+ }
+
+ if (tg->bps_conf[READ][off] == bps_dft && /* print nothing when every setting for this level is still at its default */
+ tg->bps_conf[WRITE][off] == bps_dft &&
+ tg->iops_conf[READ][off] == iops_dft &&
+ tg->iops_conf[WRITE][off] == iops_dft &&
+ (off != LIMIT_LOW ||
+ (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
+ tg->latency_target == DFL_LATENCY_TARGET)))
return 0;
- if (tg->bps[READ] != -1)
- snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
- if (tg->bps[WRITE] != -1)
- snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
- if (tg->iops[READ] != -1)
- snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
- if (tg->iops[WRITE] != -1)
- snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
-
- seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
- dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+ if (tg->bps_conf[READ][off] != bps_dft) /* the user-configured (*_conf) values are shown, not the internally used ones */
+ snprintf(bufs[0], sizeof(bufs[0]), "%llu",
+ tg->bps_conf[READ][off]);
+ if (tg->bps_conf[WRITE][off] != bps_dft)
+ snprintf(bufs[1], sizeof(bufs[1]), "%llu",
+ tg->bps_conf[WRITE][off]);
+ if (tg->iops_conf[READ][off] != iops_dft)
+ snprintf(bufs[2], sizeof(bufs[2]), "%u",
+ tg->iops_conf[READ][off]);
+ if (tg->iops_conf[WRITE][off] != iops_dft)
+ snprintf(bufs[3], sizeof(bufs[3]), "%u",
+ tg->iops_conf[WRITE][off]);
+ if (off == LIMIT_LOW) { /* idle and latency are reported for the low level only */
+ if (tg->idletime_threshold == ULONG_MAX)
+ strcpy(idle_time, " idle=max");
+ else
+ snprintf(idle_time, sizeof(idle_time), " idle=%lu",
+ tg->idletime_threshold);
+
+ if (tg->latency_target == ULONG_MAX)
+ strcpy(latency_time, " latency=max");
+ else
+ snprintf(latency_time, sizeof(latency_time),
+ " latency=%lu", tg->latency_target);
+ }
+
+ seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", /* idle_time/latency_time stay empty strings for the max level */
+ dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
+ latency_time);
return 0;
}
-static int tg_print_max(struct seq_file *sf, void *v)
+static int tg_print_limit(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
&blkcg_policy_throtl, seq_cft(sf)->private, false);
return 0;
}
-static ssize_t tg_set_max(struct kernfs_open_file *of,
+static ssize_t tg_set_limit(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkg_conf_ctx ctx;
struct throtl_grp *tg;
u64 v[4];
+ unsigned long idle_time;
+ unsigned long latency_time;
int ret;
+ int index = of_cft(of)->private;
ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
if (ret)
@@ -1312,15 +1558,17 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
tg = blkg_to_tg(ctx.blkg);
- v[0] = tg->bps[READ];
- v[1] = tg->bps[WRITE];
- v[2] = tg->iops[READ];
- v[3] = tg->iops[WRITE];
+ v[0] = tg->bps_conf[READ][index];
+ v[1] = tg->bps_conf[WRITE][index];
+ v[2] = tg->iops_conf[READ][index];
+ v[3] = tg->iops_conf[WRITE][index];
+ idle_time = tg->idletime_threshold;
+ latency_time = tg->latency_target;
while (true) {
char tok[27]; /* wiops=18446744073709551616 */
char *p;
- u64 val = -1;
+ u64 val = U64_MAX;
int len;
if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
@@ -1348,15 +1596,43 @@ static ssize_t tg_set_max(struct kernfs_open_file *of,
v[2] = min_t(u64, val, UINT_MAX);
else if (!strcmp(tok, "wiops"))
v[3] = min_t(u64, val, UINT_MAX);
+ else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
+ idle_time = val;
+ else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
+ latency_time = val;
else
goto out_finish;
}
- tg->bps[READ] = v[0];
- tg->bps[WRITE] = v[1];
- tg->iops[READ] = v[2];
- tg->iops[WRITE] = v[3];
+ tg->bps_conf[READ][index] = v[0];
+ tg->bps_conf[WRITE][index] = v[1];
+ tg->iops_conf[READ][index] = v[2];
+ tg->iops_conf[WRITE][index] = v[3];
+ if (index == LIMIT_MAX) {
+ tg->bps[READ][index] = v[0];
+ tg->bps[WRITE][index] = v[1];
+ tg->iops[READ][index] = v[2];
+ tg->iops[WRITE][index] = v[3];
+ }
+ tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
+ tg->bps_conf[READ][LIMIT_MAX]);
+ tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
+ tg->bps_conf[WRITE][LIMIT_MAX]);
+ tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
+ tg->iops_conf[READ][LIMIT_MAX]);
+ tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
+ tg->iops_conf[WRITE][LIMIT_MAX]);
+
+ if (index == LIMIT_LOW) {
+ blk_throtl_update_limit_valid(tg->td);
+ if (tg->td->limit_valid[LIMIT_LOW])
+ tg->td->limit_index = LIMIT_LOW;
+ tg->idletime_threshold = (idle_time == ULONG_MAX) ?
+ ULONG_MAX : idle_time;
+ tg->latency_target = (latency_time == ULONG_MAX) ?
+ ULONG_MAX : latency_time;
+ }
tg_conf_updated(tg);
ret = 0;
out_finish:
@@ -1365,11 +1641,21 @@ out_finish:
}
static struct cftype throtl_files[] = {
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ {
+ .name = "low",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = tg_print_limit,
+ .write = tg_set_limit,
+ .private = LIMIT_LOW,
+ },
+#endif
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = tg_print_max,
- .write = tg_set_max,
+ .seq_show = tg_print_limit,
+ .write = tg_set_limit,
+ .private = LIMIT_MAX,
},
{ } /* terminate */
};
@@ -1388,9 +1674,376 @@ static struct blkcg_policy blkcg_policy_throtl = {
.pd_alloc_fn = throtl_pd_alloc,
.pd_init_fn = throtl_pd_init,
.pd_online_fn = throtl_pd_online,
+ .pd_offline_fn = throtl_pd_offline,
.pd_free_fn = throtl_pd_free,
};
+static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+ unsigned long rtime = jiffies, wtime = jiffies;
+
+ if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
+ rtime = tg->last_low_overflow_time[READ];
+ if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
+ wtime = tg->last_low_overflow_time[WRITE];
+ return min(rtime, wtime);
+}
+
+/* tg should not be an intermediate node */
+static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *parent_sq;
+ struct throtl_grp *parent = tg;
+ unsigned long ret = __tg_last_low_overflow_time(tg);
+
+ while (true) {
+ parent_sq = parent->service_queue.parent_sq;
+ parent = sq_to_tg(parent_sq);
+ if (!parent)
+ break;
+
+ /*
+ * A parent without a low limit always reaches its low limit,
+ * so its overflow time is useless for its children.
+ */
+ if (!parent->bps[READ][LIMIT_LOW] &&
+ !parent->iops[READ][LIMIT_LOW] &&
+ !parent->bps[WRITE][LIMIT_LOW] &&
+ !parent->iops[WRITE][LIMIT_LOW])
+ continue;
+ if (time_after(__tg_last_low_overflow_time(parent), ret))
+ ret = __tg_last_low_overflow_time(parent);
+ }
+ return ret;
+}
+
+static bool throtl_tg_is_idle(struct throtl_grp *tg)
+{
+ /*
+ * cgroup is idle if:
+ * - a single idle period is too long: longer than a fixed value (in case
+ * the user configures too big a threshold) or 4 times the slice
+ * - average think time is more than threshold
+ * - IO latency is largely below threshold
+ */
+ unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+
+ time = min_t(unsigned long, MAX_IDLE_TIME, time);
+ return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+ tg->avg_idletime > tg->idletime_threshold ||
+ (tg->latency_target && tg->bio_cnt &&
+ tg->bad_bio_cnt * 5 < tg->bio_cnt);
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+ struct throtl_service_queue *sq = &tg->service_queue;
+ bool read_limit, write_limit;
+
+ /*
+ * If the cgroup reaches its low limit (a low limit of 0 is always
+ * reached), it's OK to upgrade to the next limit.
+ */
+ read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
+ write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
+ if (!read_limit && !write_limit)
+ return true;
+ if (read_limit && sq->nr_queued[READ] &&
+ (!write_limit || sq->nr_queued[WRITE]))
+ return true;
+ if (write_limit && sq->nr_queued[WRITE] &&
+ (!read_limit || sq->nr_queued[READ]))
+ return true;
+
+ if (time_after_eq(jiffies,
+ tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
+ throtl_tg_is_idle(tg))
+ return true;
+ return false;
+}
+
+static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
+{
+ while (true) {
+ if (throtl_tg_can_upgrade(tg))
+ return true;
+ tg = sq_to_tg(tg->service_queue.parent_sq);
+ if (!tg || !tg_to_blkg(tg)->parent)
+ return false;
+ }
+ return false;
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+ struct throtl_grp *this_tg)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ if (td->limit_index != LIMIT_LOW)
+ return false;
+
+ if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
+ return false;
+
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ if (tg == this_tg)
+ continue;
+ if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+ continue;
+ if (!throtl_hierarchy_can_upgrade(tg)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+ return true;
+}
+
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+ unsigned long now = jiffies;
+
+ if (tg->td->limit_index != LIMIT_LOW)
+ return;
+
+ if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+ return;
+
+ tg->last_check_time = now;
+
+ if (!time_after_eq(now,
+ __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
+ return;
+
+ if (throtl_can_upgrade(tg->td, NULL))
+ throtl_upgrade_state(tg->td);
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ td->limit_index = LIMIT_MAX;
+ td->low_upgrade_time = jiffies;
+ td->scale = 0;
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ tg->disptime = jiffies - 1;
+ throtl_select_dispatch(sq);
+ throtl_schedule_next_dispatch(sq, false);
+ }
+ rcu_read_unlock();
+ throtl_select_dispatch(&td->service_queue);
+ throtl_schedule_next_dispatch(&td->service_queue, false);
+ queue_work(kthrotld_workqueue, &td->dispatch_work);
+}
+
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+ td->scale /= 2;
+
+ if (td->scale) {
+ td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
+ return;
+ }
+
+ td->limit_index = new;
+ td->low_downgrade_time = jiffies;
+}
+
+static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
+{
+ struct throtl_data *td = tg->td;
+ unsigned long now = jiffies;
+
+ /*
+ * If cgroup is below low limit, consider downgrade and throttle other
+ * cgroups
+ */
+ if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
+ time_after_eq(now, tg_last_low_overflow_time(tg) +
+ td->throtl_slice) &&
+ (!throtl_tg_is_idle(tg) ||
+ !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
+ return true;
+ return false;
+}
+
+static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
+{
+ while (true) {
+ if (!throtl_tg_can_downgrade(tg))
+ return false;
+ tg = sq_to_tg(tg->service_queue.parent_sq);
+ if (!tg || !tg_to_blkg(tg)->parent)
+ break;
+ }
+ return true;
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+ uint64_t bps;
+ unsigned int iops;
+ unsigned long elapsed_time;
+ unsigned long now = jiffies;
+
+ if (tg->td->limit_index != LIMIT_MAX ||
+ !tg->td->limit_valid[LIMIT_LOW])
+ return;
+ if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+ return;
+ if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
+ return;
+
+ elapsed_time = now - tg->last_check_time;
+ tg->last_check_time = now;
+
+ if (time_before(now, tg_last_low_overflow_time(tg) +
+ tg->td->throtl_slice))
+ return;
+
+ if (tg->bps[READ][LIMIT_LOW]) {
+ bps = tg->last_bytes_disp[READ] * HZ;
+ do_div(bps, elapsed_time);
+ if (bps >= tg->bps[READ][LIMIT_LOW])
+ tg->last_low_overflow_time[READ] = now;
+ }
+
+ if (tg->bps[WRITE][LIMIT_LOW]) {
+ bps = tg->last_bytes_disp[WRITE] * HZ;
+ do_div(bps, elapsed_time);
+ if (bps >= tg->bps[WRITE][LIMIT_LOW])
+ tg->last_low_overflow_time[WRITE] = now;
+ }
+
+ if (tg->iops[READ][LIMIT_LOW]) {
+ iops = tg->last_io_disp[READ] * HZ / elapsed_time;
+ if (iops >= tg->iops[READ][LIMIT_LOW])
+ tg->last_low_overflow_time[READ] = now;
+ }
+
+ if (tg->iops[WRITE][LIMIT_LOW]) {
+ iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
+ if (iops >= tg->iops[WRITE][LIMIT_LOW])
+ tg->last_low_overflow_time[WRITE] = now;
+ }
+
+ /*
+ * If cgroup is below low limit, consider downgrade and throttle other
+ * cgroups
+ */
+ if (throtl_hierarchy_can_downgrade(tg))
+ throtl_downgrade_state(tg->td, LIMIT_LOW);
+
+ tg->last_bytes_disp[READ] = 0;
+ tg->last_bytes_disp[WRITE] = 0;
+ tg->last_io_disp[READ] = 0;
+ tg->last_io_disp[WRITE] = 0;
+}
+
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+ unsigned long now = ktime_get_ns() >> 10;
+ unsigned long last_finish_time = tg->last_finish_time;
+
+ if (now <= last_finish_time || last_finish_time == 0 ||
+ last_finish_time == tg->checked_last_finish_time)
+ return;
+
+ tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
+ tg->checked_last_finish_time = last_finish_time;
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_update_latency_buckets(struct throtl_data *td)
+{
+ struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
+ int i, cpu;
+ unsigned long last_latency = 0;
+ unsigned long latency;
+
+ if (!blk_queue_nonrot(td->queue))
+ return;
+ if (time_before(jiffies, td->last_calculate_time + HZ))
+ return;
+ td->last_calculate_time = jiffies;
+
+ memset(avg_latency, 0, sizeof(avg_latency));
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ struct latency_bucket *tmp = &td->tmp_buckets[i];
+
+ for_each_possible_cpu(cpu) {
+ struct latency_bucket *bucket;
+
+ /* this isn't race free, but ok in practice */
+ bucket = per_cpu_ptr(td->latency_buckets, cpu);
+ tmp->total_latency += bucket[i].total_latency;
+ tmp->samples += bucket[i].samples;
+ bucket[i].total_latency = 0;
+ bucket[i].samples = 0;
+ }
+
+ if (tmp->samples >= 32) {
+ int samples = tmp->samples;
+
+ latency = tmp->total_latency;
+
+ tmp->total_latency = 0;
+ tmp->samples = 0;
+ latency /= samples;
+ if (latency == 0)
+ continue;
+ avg_latency[i].latency = latency;
+ }
+ }
+
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ if (!avg_latency[i].latency) {
+ if (td->avg_buckets[i].latency < last_latency)
+ td->avg_buckets[i].latency = last_latency;
+ continue;
+ }
+
+ if (!td->avg_buckets[i].valid)
+ latency = avg_latency[i].latency;
+ else
+ latency = (td->avg_buckets[i].latency * 7 +
+ avg_latency[i].latency) >> 3;
+
+ td->avg_buckets[i].latency = max(latency, last_latency);
+ td->avg_buckets[i].valid = true;
+ last_latency = td->avg_buckets[i].latency;
+ }
+}
+#else
+static inline void throtl_update_latency_buckets(struct throtl_data *td)
+{
+}
+#endif
+
+static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+{
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ int ret;
+
+ ret = bio_associate_current(bio);
+ if (ret == 0 || ret == -EBUSY)
+ bio->bi_cg_private = tg;
+ blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
+#else
+ bio_associate_current(bio);
+#endif
+}
+
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio)
{
@@ -1399,6 +2052,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
bool throttled = false;
+ struct throtl_data *td = tg->td;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -1408,19 +2062,35 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
spin_lock_irq(q->queue_lock);
+ throtl_update_latency_buckets(td);
+
if (unlikely(blk_queue_bypass(q)))
goto out_unlock;
+ blk_throtl_assoc_bio(tg, bio);
+ blk_throtl_update_idletime(tg);
+
sq = &tg->service_queue;
+again:
while (true) {
+ if (tg->last_low_overflow_time[rw] == 0)
+ tg->last_low_overflow_time[rw] = jiffies;
+ throtl_downgrade_check(tg);
+ throtl_upgrade_check(tg);
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;
/* if above limits, break to queue */
- if (!tg_may_dispatch(tg, bio, NULL))
+ if (!tg_may_dispatch(tg, bio, NULL)) {
+ tg->last_low_overflow_time[rw] = jiffies;
+ if (throtl_can_upgrade(td, tg)) {
+ throtl_upgrade_state(td);
+ goto again;
+ }
break;
+ }
/* within limits, let's charge and dispatch directly */
throtl_charge_bio(tg, bio);
@@ -1453,12 +2123,14 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
/* out-of-limit, queue to @tg */
throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
rw == READ ? 'R' : 'W',
- tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw],
- tg->io_disp[rw], tg->iops[rw],
+ tg->bytes_disp[rw], bio->bi_iter.bi_size,
+ tg_bps_limit(tg, rw),
+ tg->io_disp[rw], tg_iops_limit(tg, rw),
sq->nr_queued[READ], sq->nr_queued[WRITE]);
- bio_associate_current(bio);
- tg->td->nr_queued[rw]++;
+ tg->last_low_overflow_time[rw] = jiffies;
+
+ td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
throttled = true;
@@ -1483,9 +2155,94 @@ out:
*/
if (!throttled)
bio_clear_flag(bio, BIO_THROTTLED);
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ if (throttled || !td->track_bio_latency)
+ bio->bi_issue_stat.stat |= SKIP_LATENCY;
+#endif
return throttled;
}
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+static void throtl_track_latency(struct throtl_data *td, sector_t size,
+ int op, unsigned long time)
+{
+ struct latency_bucket *latency;
+ int index;
+
+ if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+ !blk_queue_nonrot(td->queue))
+ return;
+
+ index = request_bucket_index(size);
+
+ latency = get_cpu_ptr(td->latency_buckets);
+ latency[index].total_latency += time;
+ latency[index].samples++;
+ put_cpu_ptr(td->latency_buckets);
+}
+
+void blk_throtl_stat_add(struct request *rq, u64 time_ns)
+{
+ struct request_queue *q = rq->q;
+ struct throtl_data *td = q->td;
+
+ throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
+ req_op(rq), time_ns >> 10);
+}
+
+void blk_throtl_bio_endio(struct bio *bio)
+{
+ struct throtl_grp *tg;
+ u64 finish_time_ns;
+ unsigned long finish_time;
+ unsigned long start_time;
+ unsigned long lat;
+
+ tg = bio->bi_cg_private;
+ if (!tg)
+ return;
+ bio->bi_cg_private = NULL;
+
+ finish_time_ns = ktime_get_ns();
+ tg->last_finish_time = finish_time_ns >> 10;
+
+ start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
+ finish_time = __blk_stat_time(finish_time_ns) >> 10;
+ if (!start_time || finish_time <= start_time)
+ return;
+
+ lat = finish_time - start_time;
+ /* this is only for bio based driver */
+ if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
+ throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
+ bio_op(bio), lat);
+
+ if (tg->latency_target) {
+ int bucket;
+ unsigned int threshold;
+
+ bucket = request_bucket_index(
+ blk_stat_size(&bio->bi_issue_stat));
+ threshold = tg->td->avg_buckets[bucket].latency +
+ tg->latency_target;
+ if (lat > threshold)
+ tg->bad_bio_cnt++;
+ /*
+ * Not race free, could get wrong count, which means cgroups
+ * will be throttled
+ */
+ tg->bio_cnt++;
+ }
+
+ if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
+ tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
+ tg->bio_cnt /= 2;
+ tg->bad_bio_cnt /= 2;
+ }
+}
+#endif
+
/*
* Dispatch all bios from all children tg's queued on @parent_sq. On
* return, @parent_sq is guaranteed to not have any active children tg's
@@ -1558,6 +2315,12 @@ int blk_throtl_init(struct request_queue *q)
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
+ td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+ LATENCY_BUCKET_SIZE, __alignof__(u64));
+ if (!td->latency_buckets) {
+ kfree(td);
+ return -ENOMEM;
+ }
INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
throtl_service_queue_init(&td->service_queue);
@@ -1565,10 +2328,17 @@ int blk_throtl_init(struct request_queue *q)
q->td = td;
td->queue = q;
+ td->limit_valid[LIMIT_MAX] = true;
+ td->limit_index = LIMIT_MAX;
+ td->low_upgrade_time = jiffies;
+ td->low_downgrade_time = jiffies;
+
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
- if (ret)
+ if (ret) {
+ free_percpu(td->latency_buckets);
kfree(td);
+ }
return ret;
}
@@ -1577,9 +2347,74 @@ void blk_throtl_exit(struct request_queue *q)
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+ free_percpu(q->td->latency_buckets);
kfree(q->td);
}
+void blk_throtl_register_queue(struct request_queue *q)
+{
+ struct throtl_data *td;
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ td = q->td;
+ BUG_ON(!td);
+
+ if (blk_queue_nonrot(q)) {
+ td->throtl_slice = DFL_THROTL_SLICE_SSD;
+ td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
+ } else {
+ td->throtl_slice = DFL_THROTL_SLICE_HD;
+ td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
+ }
+#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
+ /* if no low limit, use previous default */
+ td->throtl_slice = DFL_THROTL_SLICE_HD;
+#endif
+
+ td->track_bio_latency = !q->mq_ops && !q->request_fn;
+ if (!td->track_bio_latency)
+ blk_stat_enable_accounting(q);
+
+ /*
+ * Some tgs are created before the queue is fully initialized, e.g.
+ * before nonrot is set.
+ */
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+
+ tg->idletime_threshold = td->dft_idletime_threshold;
+ }
+ rcu_read_unlock();
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
+{
+ if (!q->td)
+ return -EINVAL;
+ return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
+}
+
+ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long v;
+ unsigned long t;
+
+ if (!q->td)
+ return -EINVAL;
+ if (kstrtoul(page, 10, &v))
+ return -EINVAL;
+ t = msecs_to_jiffies(v);
+ if (t == 0 || t > MAX_THROTL_SLICE)
+ return -EINVAL;
+ q->td->throtl_slice = t;
+ return count;
+}
+#endif
+
static int __init throtl_init(void)
{
kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index a30441a200c09..cbff183f3d9f9 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -89,7 +89,6 @@ static void blk_rq_timed_out(struct request *req)
ret = q->rq_timed_out_fn(req);
switch (ret) {
case BLK_EH_HANDLED:
- /* Can we use req->errors here? */
__blk_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 1aedb1f7ee0c7..17676f4d7fd15 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -255,8 +255,8 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
* that it's writes impacting us, and not just some sole read on
* a device that is in a lower power state.
*/
- return stat[BLK_STAT_READ].nr_samples >= 1 &&
- stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+ return (stat[READ].nr_samples >= 1 &&
+ stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
@@ -277,7 +277,7 @@ enum {
LAT_EXCEEDED,
};
-static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
u64 thislat;
@@ -293,7 +293,7 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
*/
thislat = rwb_sync_issue_lat(rwb);
if (thislat > rwb->cur_win_nsec ||
- (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) {
+ (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
trace_wbt_lat(bdi, thislat);
return LAT_EXCEEDED;
}
@@ -308,8 +308,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
* waited or still has writes in flights, consider us doing
* just writes as well.
*/
- if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) ||
- wb_recent_wait(rwb) || wbt_inflight(rwb))
+ if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
+ wbt_inflight(rwb))
return LAT_UNKNOWN_WRITES;
return LAT_UNKNOWN;
}
@@ -317,8 +317,8 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
/*
* If the 'min' latency exceeds our target, step down.
*/
- if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) {
- trace_wbt_lat(bdi, stat[BLK_STAT_READ].min);
+ if (stat[READ].min > rwb->min_lat_nsec) {
+ trace_wbt_lat(bdi, stat[READ].min);
trace_wbt_stat(bdi, stat);
return LAT_EXCEEDED;
}
@@ -329,14 +329,6 @@ static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
return LAT_OK;
}
-static int latency_exceeded(struct rq_wb *rwb)
-{
- struct blk_rq_stat stat[2];
-
- blk_queue_stat_get(rwb->queue, stat);
- return __latency_exceeded(rwb, stat);
-}
-
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
@@ -355,7 +347,6 @@ static void scale_up(struct rq_wb *rwb)
rwb->scale_step--;
rwb->unknown_cnt = 0;
- blk_stat_clear(rwb->queue);
rwb->scaled_max = calc_wb_limits(rwb);
@@ -385,15 +376,12 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
rwb->scaled_max = false;
rwb->unknown_cnt = 0;
- blk_stat_clear(rwb->queue);
calc_wb_limits(rwb);
rwb_trace_step(rwb, "step down");
}
static void rwb_arm_timer(struct rq_wb *rwb)
{
- unsigned long expires;
-
if (rwb->scale_step > 0) {
/*
* We should speed this up, using some variant of a fast
@@ -411,17 +399,16 @@ static void rwb_arm_timer(struct rq_wb *rwb)
rwb->cur_win_nsec = rwb->win_nsec;
}
- expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
- mod_timer(&rwb->window_timer, expires);
+ blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}
-static void wb_timer_fn(unsigned long data)
+static void wb_timer_fn(struct blk_stat_callback *cb)
{
- struct rq_wb *rwb = (struct rq_wb *) data;
+ struct rq_wb *rwb = cb->data;
unsigned int inflight = wbt_inflight(rwb);
int status;
- status = latency_exceeded(rwb);
+ status = latency_exceeded(rwb, cb->stat);
trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
inflight);
@@ -614,7 +601,7 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
__wbt_wait(rwb, bio->bi_opf, lock);
- if (!timer_pending(&rwb->window_timer))
+ if (!blk_stat_is_active(rwb->cb))
rwb_arm_timer(rwb);
if (current_is_kswapd())
@@ -666,22 +653,37 @@ void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
rwb->wc = write_cache_on;
}
- /*
- * Disable wbt, if enabled by default. Only called from CFQ, if we have
- * cgroups enabled
+/*
+ * Disable wbt, if enabled by default. Only called from CFQ.
*/
void wbt_disable_default(struct request_queue *q)
{
struct rq_wb *rwb = q->rq_wb;
- if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
- del_timer_sync(&rwb->window_timer);
- rwb->win_nsec = rwb->min_lat_nsec = 0;
- wbt_update_limits(rwb);
- }
+ if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
+ wbt_exit(q);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);
+/*
+ * Enable wbt if defaults are configured that way
+ */
+void wbt_enable_default(struct request_queue *q)
+{
+ /* Throttling already enabled? */
+ if (q->rq_wb)
+ return;
+
+ /* Queue not registered? Maybe shutting down... */
+ if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ return;
+
+ if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
+ (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
+ wbt_init(q);
+}
+EXPORT_SYMBOL_GPL(wbt_enable_default);
+
u64 wbt_default_latency_nsec(struct request_queue *q)
{
/*
@@ -694,29 +696,33 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
return 75000000ULL;
}
+static int wbt_data_dir(const struct request *rq)
+{
+ return rq_data_dir(rq);
+}
+
int wbt_init(struct request_queue *q)
{
struct rq_wb *rwb;
int i;
- /*
- * For now, we depend on the stats window being larger than
- * our monitoring window. Ensure that this isn't inadvertently
- * violated.
- */
- BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
return -ENOMEM;
+ rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
+ if (!rwb->cb) {
+ kfree(rwb);
+ return -ENOMEM;
+ }
+
for (i = 0; i < WBT_NUM_RWQ; i++) {
atomic_set(&rwb->rq_wait[i].inflight, 0);
init_waitqueue_head(&rwb->rq_wait[i].wait);
}
- setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
rwb->wc = 1;
rwb->queue_depth = RWB_DEF_DEPTH;
rwb->last_comp = rwb->last_issue = jiffies;
@@ -726,10 +732,10 @@ int wbt_init(struct request_queue *q)
wbt_update_limits(rwb);
/*
- * Assign rwb, and turn on stats tracking for this queue
+ * Assign rwb and add the stats callback.
*/
q->rq_wb = rwb;
- blk_stat_enable(q);
+ blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
@@ -744,7 +750,8 @@ void wbt_exit(struct request_queue *q)
struct rq_wb *rwb = q->rq_wb;
if (rwb) {
- del_timer_sync(&rwb->window_timer);
+ blk_stat_remove_callback(q, rwb->cb);
+ blk_stat_free_callback(rwb->cb);
q->rq_wb = NULL;
kfree(rwb);
}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 65f1de519f67e..df6de50c5d594 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -32,27 +32,27 @@ enum {
static inline void wbt_clear_state(struct blk_issue_stat *stat)
{
- stat->time &= BLK_STAT_TIME_MASK;
+ stat->stat &= ~BLK_STAT_RES_MASK;
}
static inline enum wbt_flags wbt_stat_to_mask(struct blk_issue_stat *stat)
{
- return (stat->time & BLK_STAT_MASK) >> BLK_STAT_SHIFT;
+ return (stat->stat & BLK_STAT_RES_MASK) >> BLK_STAT_RES_SHIFT;
}
static inline void wbt_track(struct blk_issue_stat *stat, enum wbt_flags wb_acct)
{
- stat->time |= ((u64) wb_acct) << BLK_STAT_SHIFT;
+ stat->stat |= ((u64) wb_acct) << BLK_STAT_RES_SHIFT;
}
static inline bool wbt_is_tracked(struct blk_issue_stat *stat)
{
- return (stat->time >> BLK_STAT_SHIFT) & WBT_TRACKED;
+ return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_TRACKED;
}
static inline bool wbt_is_read(struct blk_issue_stat *stat)
{
- return (stat->time >> BLK_STAT_SHIFT) & WBT_READ;
+ return (stat->stat >> BLK_STAT_RES_SHIFT) & WBT_READ;
}
struct rq_wait {
@@ -81,7 +81,7 @@ struct rq_wb {
u64 win_nsec; /* default window size */
u64 cur_win_nsec; /* current window size */
- struct timer_list window_timer;
+ struct blk_stat_callback *cb;
s64 sync_issue;
void *sync_cookie;
@@ -117,6 +117,7 @@ void wbt_update_limits(struct rq_wb *);
void wbt_requeue(struct rq_wb *, struct blk_issue_stat *);
void wbt_issue(struct rq_wb *, struct blk_issue_stat *);
void wbt_disable_default(struct request_queue *);
+void wbt_enable_default(struct request_queue *);
void wbt_set_queue_depth(struct rq_wb *, unsigned int);
void wbt_set_write_cache(struct rq_wb *, bool);
@@ -155,6 +156,9 @@ static inline void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
static inline void wbt_disable_default(struct request_queue *q)
{
}
+static inline void wbt_enable_default(struct request_queue *q)
+{
+}
static inline void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
{
}
diff --git a/block/blk.h b/block/blk.h
index d1ea4bd9b9a3f..2ed70228e44fc 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -60,15 +60,12 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
int blk_init_rl(struct request_list *rl, struct request_queue *q,
gfp_t gfp_mask);
void blk_exit_rl(struct request_list *rl);
-void init_request_from_bio(struct request *req, struct bio *bio);
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio);
void blk_queue_bypass_start(struct request_queue *q);
void blk_queue_bypass_end(struct request_queue *q);
void blk_dequeue_request(struct request *rq);
void __blk_queue_free_tags(struct request_queue *q);
-bool __blk_end_bidi_request(struct request *rq, int error,
- unsigned int nr_bytes, unsigned int bidi_bytes);
void blk_freeze_queue(struct request_queue *q);
static inline void blk_queue_enter_live(struct request_queue *q)
@@ -319,10 +316,22 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
+extern void blk_throtl_register_queue(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
+static inline void blk_throtl_register_queue(struct request_queue *q) { }
#endif /* CONFIG_BLK_DEV_THROTTLING */
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
+extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
+ const char *page, size_t count);
+extern void blk_throtl_bio_endio(struct bio *bio);
+extern void blk_throtl_stat_add(struct request *rq, u64 time);
+#else
+static inline void blk_throtl_bio_endio(struct bio *bio) { }
+static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
+#endif
#endif /* BLK_INTERNAL_H */
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index cd15f9dbb1474..0a23dbba2d301 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -37,7 +37,7 @@ static void bsg_destroy_job(struct kref *kref)
struct bsg_job *job = container_of(kref, struct bsg_job, kref);
struct request *rq = job->req;
- blk_end_request_all(rq, rq->errors);
+ blk_end_request_all(rq, scsi_req(rq)->result);
put_device(job->dev); /* release reference for the request */
@@ -74,7 +74,7 @@ void bsg_job_done(struct bsg_job *job, int result,
struct scsi_request *rq = scsi_req(req);
int err;
- err = job->req->errors = result;
+ err = scsi_req(job->req)->result = result;
if (err < 0)
/* we're only returning the result field in the reply */
rq->sense_len = sizeof(u32);
@@ -177,7 +177,7 @@ failjob_rls_job:
* @q: request queue to manage
*
* On error the create_bsg_job function should return a -Exyz error value
- * that will be set to the req->errors.
+ * that will be set to ->result.
*
* Drivers/subsys should pass this to the queue init function.
*/
@@ -201,7 +201,7 @@ static void bsg_request_fn(struct request_queue *q)
ret = bsg_create_job(dev, req);
if (ret) {
- req->errors = ret;
+ scsi_req(req)->result = ret;
blk_end_request_all(req, ret);
spin_lock_irq(q->queue_lock);
continue;
diff --git a/block/bsg.c b/block/bsg.c
index 74835dbf0c47c..d9da1b613cedf 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -391,13 +391,13 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
struct scsi_request *req = scsi_req(rq);
int ret = 0;
- dprintk("rq %p bio %p 0x%x\n", rq, bio, rq->errors);
+ dprintk("rq %p bio %p 0x%x\n", rq, bio, req->result);
/*
* fill in all the output members
*/
- hdr->device_status = rq->errors & 0xff;
- hdr->transport_status = host_byte(rq->errors);
- hdr->driver_status = driver_byte(rq->errors);
+ hdr->device_status = req->result & 0xff;
+ hdr->transport_status = host_byte(req->result);
+ hdr->driver_status = driver_byte(req->result);
hdr->info = 0;
if (hdr->device_status || hdr->transport_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
@@ -431,8 +431,8 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
* just a protocol response (i.e. non negative), that gets
* processed above.
*/
- if (!ret && rq->errors < 0)
- ret = rq->errors;
+ if (!ret && req->result < 0)
+ ret = req->result;
blk_rq_unmap_user(bio);
scsi_req_free_cmd(req);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 440b95ee593c9..da69b079725fb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3761,16 +3761,14 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
+static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
uint64_t serial_nr;
- bool nonroot_cg;
rcu_read_lock();
serial_nr = bio_blkcg(bio)->css.serial_nr;
- nonroot_cg = bio_blkcg(bio) != &blkcg_root;
rcu_read_unlock();
/*
@@ -3778,7 +3776,7 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
* spuriously on a newly created cic but there's no harm.
*/
if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
- return nonroot_cg;
+ return;
/*
* Drop reference to queues. New queues will be assigned in new
@@ -3799,12 +3797,10 @@ static bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
}
cic->blkcg_serial_nr = serial_nr;
- return nonroot_cg;
}
#else
-static inline bool check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
+static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
- return false;
}
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
@@ -4449,12 +4445,11 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
const int rw = rq_data_dir(rq);
const bool is_sync = rq_is_sync(rq);
struct cfq_queue *cfqq;
- bool disable_wbt;
spin_lock_irq(q->queue_lock);
check_ioprio_changed(cic, bio);
- disable_wbt = check_blkcg_changed(cic, bio);
+ check_blkcg_changed(cic, bio);
new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
@@ -4491,9 +4486,6 @@ new_queue:
rq->elv.priv[1] = cfqq->cfqg;
spin_unlock_irq(q->queue_lock);
- if (disable_wbt)
- wbt_disable_default(q);
-
return 0;
}
@@ -4706,6 +4698,7 @@ static void cfq_registered_queue(struct request_queue *q)
*/
if (blk_queue_nonrot(q))
cfqd->cfq_slice_idle = 0;
+ wbt_disable_default(q);
}
/*
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 570021a0dc1ca..04325b81c2b41 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -685,7 +685,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKALIGNOFF:
return compat_put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
- return compat_put_uint(arg, bdev_discard_zeroes_data(bdev));
+ return compat_put_uint(arg, 0);
case BLKFLSBUF:
case BLKROSET:
case BLKDISCARD:
diff --git a/block/elevator.c b/block/elevator.c
index 4d9084a14c109..bf11e70f008b1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -41,6 +41,7 @@
#include "blk.h"
#include "blk-mq-sched.h"
+#include "blk-wbt.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -877,6 +878,8 @@ void elv_unregister_queue(struct request_queue *q)
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
e->registered = 0;
+ /* Re-enable throttling in case elevator disabled it */
+ wbt_enable_default(q);
}
}
EXPORT_SYMBOL(elv_unregister_queue);
diff --git a/block/genhd.c b/block/genhd.c
index a9c516a8b37db..9a2d01abfa3b4 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1060,8 +1060,19 @@ static struct attribute *disk_attrs[] = {
NULL
};
+/*
+ * sysfs visibility callback for the disk attribute group: the "badblocks"
+ * attribute is hidden (mode 0) for disks that have no badblocks data
+ * (disk->bb is NULL); every other attribute keeps its declared mode.
+ */
+static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = container_of(kobj, struct device, kobj);
+	bool hide_badblocks;
+
+	hide_badblocks = (a == &dev_attr_badblocks.attr) &&
+			 !dev_to_disk(dev)->bb;
+
+	return hide_badblocks ? 0 : a->mode;
+}
+
static struct attribute_group disk_attr_group = {
.attrs = disk_attrs,
+ .is_visible = disk_visible,
};
static const struct attribute_group *disk_attr_groups[] = {
@@ -1352,7 +1363,7 @@ struct kobject *get_disk(struct gendisk *disk)
owner = disk->fops->owner;
if (owner && !try_module_get(owner))
return NULL;
- kobj = kobject_get(&disk_to_dev(disk)->kobj);
+ kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
if (kobj == NULL) {
module_put(owner);
return NULL;
diff --git a/block/ioctl.c b/block/ioctl.c
index 7b88820b93d9d..0de02ee67eed8 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -255,7 +255,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
truncate_inode_pages_range(mapping, start, end);
return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
- false);
+ BLKDEV_ZERO_NOUNMAP);
}
static int put_ushort(unsigned long arg, unsigned short val)
@@ -547,7 +547,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKALIGNOFF:
return put_int(arg, bdev_alignment_offset(bdev));
case BLKDISCARDZEROES:
- return put_uint(arg, bdev_discard_zeroes_data(bdev));
+ return put_uint(arg, 0);
case BLKSECTGET:
max_sectors = min_t(unsigned int, USHRT_MAX,
queue_max_sectors(bdev_get_queue(bdev)));
diff --git a/block/ioprio.c b/block/ioprio.c
index 0c47a00f92a85..4b120c9cf7e8b 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -163,22 +163,12 @@ out:
int ioprio_best(unsigned short aprio, unsigned short bprio)
{
- unsigned short aclass;
- unsigned short bclass;
-
if (!ioprio_valid(aprio))
aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
if (!ioprio_valid(bprio))
bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
- aclass = IOPRIO_PRIO_CLASS(aprio);
- bclass = IOPRIO_PRIO_CLASS(bprio);
- if (aclass == bclass)
- return min(aprio, bprio);
- if (aclass > bclass)
- return bprio;
- else
- return aprio;
+ return min(aprio, bprio);
}
SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
new file mode 100644
index 0000000000000..3b0090bc5dd1b
--- /dev/null
+++ b/block/kyber-iosched.c
@@ -0,0 +1,719 @@
+/*
+ * The Kyber I/O scheduler. Controls latency by throttling queue depths using
+ * scalable techniques.
+ *
+ * Copyright (C) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/module.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-stat.h"
+
+/*
+ * Scheduling domains. rq_sched_domain() classifies every request into one of
+ * these, and the values are used as indices into the per-domain arrays in this
+ * file (kyber_depth[], kyber_batch_size[], domain_tokens[], rqs[], ...).
+ */
+enum {
+ KYBER_READ,
+ KYBER_SYNC_WRITE,
+ KYBER_OTHER, /* Async writes, discard, etc. */
+ KYBER_NUM_DOMAINS, /* Number of domains; used to size the arrays. */
+};
+
+/* Tunable constants used when the scheduler data is set up. */
+enum {
+ /*
+ * Lower bound on the size of each domain token bitmap: the bitmaps are
+ * allocated with the larger of this value and the tag set's queue depth.
+ */
+ KYBER_MIN_DEPTH = 256,
+
+ /*
+ * In order to prevent starvation of synchronous requests by a flood of
+ * asynchronous requests, we reserve 25% of requests for synchronous
+ * operations.
+ */
+ KYBER_ASYNC_PERCENT = 75,
+};
+
+/*
+ * Initial device-wide depths for each scheduling domain.
+ *
+ * Even for fast devices with lots of tags like NVMe, you can saturate
+ * the device with only a fraction of the maximum possible queue depth.
+ * So, we cap these to a reasonable value.
+ *
+ * The same values are the upper bound when the depths are adjusted at runtime:
+ * the adjusted depth is clamped to [1, kyber_depth[domain]].
+ */
+static const unsigned int kyber_depth[] = {
+ [KYBER_READ] = 256,
+ [KYBER_SYNC_WRITE] = 128,
+ [KYBER_OTHER] = 64,
+};
+
+/*
+ * Scheduling domain batch sizes. We favor reads.
+ *
+ * kyber_dispatch_request() keeps dispatching from the current domain while
+ * fewer than this many requests have been dispatched in the current batch.
+ */
+static const unsigned int kyber_batch_size[] = {
+ [KYBER_READ] = 16,
+ [KYBER_SYNC_WRITE] = 8,
+ [KYBER_OTHER] = 8,
+};
+
+/*
+ * Per-request-queue scheduler state, allocated by kyber_queue_data_alloc() and
+ * stored in the elevator queue's elevator_data.
+ */
+struct kyber_queue_data {
+ /* Request queue this scheduler instance is attached to. */
+ struct request_queue *q;
+
+ /*
+ * Latency statistics callback. Its timer function is
+ * kyber_stat_timer_fn() and it buckets requests with rq_sched_domain().
+ */
+ struct blk_stat_callback *cb;
+
+ /*
+ * The device is divided into multiple scheduling domains based on the
+ * request type. Each domain has a fixed number of in-flight requests of
+ * that type device-wide, limited by these tokens.
+ */
+ struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];
+
+ /*
+ * Async request percentage, converted to per-word depth for
+ * sbitmap_get_shallow().
+ */
+ unsigned int async_depth;
+
+ /* Target latencies in nanoseconds. */
+ u64 read_lat_nsec, write_lat_nsec;
+};
+
+/*
+ * Per-hardware-queue scheduler state, allocated by kyber_init_hctx() and stored
+ * in hctx->sched_data.
+ */
+struct kyber_hctx_data {
+ /* Held by kyber_dispatch_request() for the duration of a dispatch. */
+ spinlock_t lock;
+ /* Requests waiting to be dispatched, one list per scheduling domain. */
+ struct list_head rqs[KYBER_NUM_DOMAINS];
+ /* Domain the current batch is being dispatched from. */
+ unsigned int cur_domain;
+ /* Number of requests dispatched so far in the current batch. */
+ unsigned int batching;
+ /*
+ * Wait queue entries used to rerun this hardware queue (via
+ * kyber_domain_wake()) after a domain ran out of tokens.
+ */
+ wait_queue_t domain_wait[KYBER_NUM_DOMAINS];
+ /* Per-domain index passed to sbq_wait_ptr() to pick a wait queue. */
+ atomic_t wait_index[KYBER_NUM_DOMAINS];
+};
+
+/*
+ * Classify a request into a scheduling domain: reads go to KYBER_READ,
+ * synchronous writes to KYBER_SYNC_WRITE, and everything else (including
+ * asynchronous writes) to KYBER_OTHER.
+ */
+static int rq_sched_domain(const struct request *rq)
+{
+	const unsigned int flags = rq->cmd_flags;
+
+	switch (flags & REQ_OP_MASK) {
+	case REQ_OP_READ:
+		return KYBER_READ;
+	case REQ_OP_WRITE:
+		if (op_is_sync(flags))
+			return KYBER_SYNC_WRITE;
+		/* Async write: fall through. */
+	default:
+		return KYBER_OTHER;
+	}
+}
+
+/*
+ * Latency status of a domain relative to its target, as computed by
+ * kyber_lat_status(). Positive values mean the mean latency met the target,
+ * negative values mean it missed it.
+ */
+enum {
+ NONE = 0, /* No samples were collected. */
+ GOOD = 1, /* Above half the target, but not above the target. */
+ GREAT = 2, /* At most half the target. */
+ BAD = -1, /* Above the target, but below twice the target. */
+ AWFUL = -2, /* At least twice the target. */
+};
+
+/* Sign tests on a status value; NONE is neither good nor bad. */
+#define IS_GOOD(status) ((status) > 0)
+#define IS_BAD(status) ((status) < 0)
+
+/*
+ * Grade the mean latency collected for @sched_domain against @target (both in
+ * the same unit). Returns NONE when the domain has no samples, otherwise one
+ * of AWFUL, BAD, GOOD or GREAT.
+ */
+static int kyber_lat_status(struct blk_stat_callback *cb,
+			    unsigned int sched_domain, u64 target)
+{
+	u64 mean;
+
+	if (cb->stat[sched_domain].nr_samples == 0)
+		return NONE;
+
+	mean = cb->stat[sched_domain].mean;
+
+	/* Checked from worst to best; the first matching grade wins. */
+	if (mean >= 2 * target)
+		return AWFUL;
+	if (mean > target)
+		return BAD;
+	if (mean > target / 2)
+		return GOOD;
+	return GREAT;
+}
+
+/*
+ * Rebalance the token depth of the read or synchronous write domain
+ * (@sched_domain) from its own latency status (@this_status) and that of the
+ * opposite domain (@other_status). The aim is fairness between the two: a
+ * domain that is doing well gives up depth when the other one is suffering,
+ * and a domain that is suffering gains depth.
+ */
+static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
+				  unsigned int sched_domain, int this_status,
+				  int other_status)
+{
+	struct sbitmap_queue *tokens = &kqd->domain_tokens[sched_domain];
+	unsigned int old_depth, new_depth;
+
+	/* Nothing to go on without samples for this domain. */
+	if (this_status == NONE)
+		return;
+	/* Both domains agree (both good or both bad): leave the depth alone. */
+	if (IS_GOOD(this_status) && IS_GOOD(other_status))
+		return;
+	if (IS_BAD(this_status) && IS_BAD(other_status))
+		return;
+
+	old_depth = tokens->sb.depth;
+	new_depth = old_depth;
+
+	if (other_status == NONE) {
+		new_depth++;
+	} else if (this_status == GOOD) {
+		if (other_status == AWFUL)
+			new_depth -= max(new_depth / 4, 1U);
+		else
+			new_depth -= max(new_depth / 8, 1U);
+	} else if (this_status == GREAT) {
+		if (other_status == AWFUL)
+			new_depth /= 2;
+		else
+			new_depth -= max(new_depth / 4, 1U);
+	} else if (this_status == BAD) {
+		new_depth++;
+	} else if (this_status == AWFUL) {
+		if (other_status == GREAT)
+			new_depth += 2;
+		else
+			new_depth++;
+	}
+
+	/* Never drop below one token or exceed the domain's initial depth. */
+	new_depth = clamp(new_depth, 1U, kyber_depth[sched_domain]);
+	if (new_depth != old_depth)
+		sbitmap_queue_resize(tokens, new_depth);
+}
+
+/*
+ * Adjust the token depth of the KYBER_OTHER domain from the read and
+ * synchronous write statuses. The better of the two statuses decides: while
+ * at least one of them is meeting its target the depth grows, and only when
+ * both miss their targets is it cut back.
+ *
+ * When neither domain has samples the depth grows by two. Otherwise the depth
+ * is only touched if KYBER_OTHER itself had samples (@have_samples).
+ */
+static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
+				     int read_status, int write_status,
+				     bool have_samples)
+{
+	struct sbitmap_queue *tokens = &kqd->domain_tokens[KYBER_OTHER];
+	unsigned int old_depth = tokens->sb.depth;
+	unsigned int new_depth = old_depth;
+
+	if (read_status == NONE && write_status == NONE) {
+		new_depth += 2;
+	} else if (have_samples) {
+		int best;
+
+		/* Ignore a domain without samples; else take the better one. */
+		if (read_status == NONE)
+			best = write_status;
+		else if (write_status == NONE)
+			best = read_status;
+		else
+			best = max(read_status, write_status);
+
+		if (best == GREAT)
+			new_depth += 2;
+		else if (best == GOOD)
+			new_depth++;
+		else if (best == BAD)
+			new_depth -= max(new_depth / 4, 1U);
+		else if (best == AWFUL)
+			new_depth /= 2;
+	}
+
+	/* Never drop below one token or exceed the domain's initial depth. */
+	new_depth = clamp(new_depth, 1U, kyber_depth[KYBER_OTHER]);
+	if (new_depth != old_depth)
+		sbitmap_queue_resize(tokens, new_depth);
+}
+
+/*
+ * Statistics timer callback: grade the read and synchronous write latencies
+ * gathered in @cb against their targets and resize the per-domain token
+ * depths accordingly.
+ */
+static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
+{
+	struct kyber_queue_data *kqd = cb->data;
+	const int rd = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
+	const int wr = kyber_lat_status(cb, KYBER_SYNC_WRITE,
+					kqd->write_lat_nsec);
+
+	kyber_adjust_rw_depth(kqd, KYBER_READ, rd, wr);
+	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, wr, rd);
+	kyber_adjust_other_depth(kqd, rd, wr,
+				 cb->stat[KYBER_OTHER].nr_samples != 0);
+
+	/* Already monitoring: nothing to re-arm. */
+	if (blk_stat_is_active(kqd->cb))
+		return;
+
+	/*
+	 * Keep sampling for another window while a target is being missed or
+	 * the other-requests domain is still throttled below its initial depth.
+	 */
+	if (IS_BAD(rd) || IS_BAD(wr) ||
+	    kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])
+		blk_stat_activate_msecs(kqd->cb, 100);
+}
+
+/* Return the sbitmap shift of the scheduler tags of the hardware queues. */
+static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
+{
+	/*
+	 * Every hardware queue has the same depth, so the first one is
+	 * representative of all of them.
+	 */
+	struct blk_mq_hw_ctx *first = kqd->q->queue_hw_ctx[0];
+
+	return first->sched_tags->bitmap_tags.sb.shift;
+}
+
+/*
+ * Allocate and initialize the per-queue scheduler data for @q: the statistics
+ * callback, one token bitmap per scheduling domain, the async depth and the
+ * default latency targets.
+ *
+ * Returns the new kyber_queue_data, or an ERR_PTR() carrying -ENOMEM or the
+ * error returned by sbitmap_queue_init_node(). On failure everything that was
+ * set up so far is released again.
+ */
+static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
+{
+ struct kyber_queue_data *kqd;
+ unsigned int max_tokens;
+ unsigned int shift;
+ int ret = -ENOMEM;
+ int i;
+
+ kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
+ if (!kqd)
+ goto err;
+ kqd->q = q;
+
+ kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, rq_sched_domain,
+ KYBER_NUM_DOMAINS, kqd);
+ if (!kqd->cb)
+ goto err_kqd;
+
+ /*
+ * The maximum number of tokens for any scheduling domain is at least
+ * the queue depth of a single hardware queue. If the hardware doesn't
+ * have many tags, still provide a reasonable number.
+ */
+ max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
+ KYBER_MIN_DEPTH);
+ for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
+ WARN_ON(!kyber_depth[i]);
+ WARN_ON(!kyber_batch_size[i]);
+ ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
+ max_tokens, -1, false, GFP_KERNEL,
+ q->node);
+ if (ret) {
+ /* Free only the bitmaps set up by earlier iterations. */
+ while (--i >= 0)
+ sbitmap_queue_free(&kqd->domain_tokens[i]);
+ goto err_cb;
+ }
+ /* Start the domain at its initial device-wide depth. */
+ sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
+ }
+
+ /* KYBER_ASYNC_PERCENT of the bits in one sbitmap word (1 << shift). */
+ shift = kyber_sched_tags_shift(kqd);
+ kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+
+ /* Default targets: 2 ms for reads, 10 ms for synchronous writes. */
+ kqd->read_lat_nsec = 2000000ULL;
+ kqd->write_lat_nsec = 10000000ULL;
+
+ return kqd;
+
+ /* Unwind in reverse order of setup. */
+err_cb:
+ blk_stat_free_callback(kqd->cb);
+err_kqd:
+ kfree(kqd);
+err:
+ return ERR_PTR(ret);
+}
+
+/*
+ * Elevator init hook: allocate the elevator queue and the Kyber queue data for
+ * @q, attach them to the queue and register the statistics callback.
+ *
+ * Returns 0 on success, -ENOMEM if the elevator queue cannot be allocated, or
+ * the error from kyber_queue_data_alloc().
+ */
+static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
+{
+	struct elevator_queue *eq = elevator_alloc(q, e);
+	struct kyber_queue_data *kqd;
+
+	if (eq == NULL)
+		return -ENOMEM;
+
+	kqd = kyber_queue_data_alloc(q);
+	if (!IS_ERR(kqd)) {
+		eq->elevator_data = kqd;
+		q->elevator = eq;
+
+		blk_stat_add_callback(q, kqd->cb);
+		return 0;
+	}
+
+	/* Drop the reference on the elevator queue allocated above. */
+	kobject_put(&eq->kobj);
+	return PTR_ERR(kqd);
+}
+
+/*
+ * Elevator exit hook: unregister the statistics callback, then release the
+ * domain token bitmaps, the callback and the queue data itself.
+ */
+static void kyber_exit_sched(struct elevator_queue *e)
+{
+	struct kyber_queue_data *kqd = e->elevator_data;
+	unsigned int domain;
+
+	/* Stop receiving statistics before tearing anything down. */
+	blk_stat_remove_callback(kqd->q, kqd->cb);
+
+	for (domain = 0; domain < KYBER_NUM_DOMAINS; domain++)
+		sbitmap_queue_free(&kqd->domain_tokens[domain]);
+
+	blk_stat_free_callback(kqd->cb);
+	kfree(kqd);
+}
+
+/*
+ * Per-hardware-queue init hook: allocate the Kyber state for @hctx on its NUMA
+ * node, initialize it and store it in hctx->sched_data.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+	struct kyber_hctx_data *khd;
+	unsigned int domain;
+
+	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
+	if (khd == NULL)
+		return -ENOMEM;
+
+	/* Start with the first domain and an empty batch. */
+	khd->cur_domain = 0;
+	khd->batching = 0;
+	spin_lock_init(&khd->lock);
+
+	for (domain = 0; domain < KYBER_NUM_DOMAINS; domain++) {
+		atomic_set(&khd->wait_index[domain], 0);
+		/* An empty task_list marks the wait entry as not queued. */
+		INIT_LIST_HEAD(&khd->domain_wait[domain].task_list);
+		INIT_LIST_HEAD(&khd->rqs[domain]);
+	}
+
+	hctx->sched_data = khd;
+	return 0;
+}
+
+/* Per-hardware-queue exit hook: free the state set up by kyber_init_hctx(). */
+static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+	struct kyber_hctx_data *khd = hctx->sched_data;
+
+	kfree(khd);
+}
+
+/*
+ * Read back the domain token number stored in the request's first elevator
+ * private slot by rq_set_domain_token(); -1 means no token is held.
+ */
+static int rq_get_domain_token(struct request *rq)
+{
+	const long stored = (long)rq->elv.priv[0];
+
+	return stored;
+}
+
+/*
+ * Remember @token in the request's first elevator private slot, encoded as a
+ * pointer-sized integer. Callers use -1 for "no token".
+ */
+static void rq_set_domain_token(struct request *rq, int token)
+{
+	const long encoded = token;
+
+	rq->elv.priv[0] = (void *)encoded;
+}
+
+/*
+ * Give the domain token held by @rq, if any, back to the token bitmap of the
+ * request's scheduling domain.
+ */
+static void rq_clear_domain_token(struct kyber_queue_data *kqd,
+				  struct request *rq)
+{
+	const int token = rq_get_domain_token(rq);
+	unsigned int domain;
+
+	/* -1 means the request never obtained a token. */
+	if (token == -1)
+		return;
+
+	domain = rq_sched_domain(rq);
+	sbitmap_queue_clear(&kqd->domain_tokens[domain], token,
+			    rq->mq_ctx->cpu);
+}
+
+/*
+ * Request allocation hook. The scheduler tags act as per-hardware-queue
+ * queueing tokens, so asynchronous operations are limited here by allocating
+ * them with the shallower async depth. A newly allocated request starts out
+ * without a domain token.
+ *
+ * Returns the allocated request, or NULL if allocation failed.
+ */
+static struct request *kyber_get_request(struct request_queue *q,
+					 unsigned int op,
+					 struct blk_mq_alloc_data *data)
+{
+	struct kyber_queue_data *kqd = q->elevator->elevator_data;
+	struct request *rq;
+
+	if (!op_is_sync(op))
+		data->shallow_depth = kqd->async_depth;
+
+	rq = __blk_mq_alloc_request(data, op);
+	if (rq == NULL)
+		return NULL;
+
+	rq_set_domain_token(rq, -1);
+	return rq;
+}
+
+/*
+ * Request release hook: return the request's domain token (if it holds one)
+ * and then finish the request.
+ */
+static void kyber_put_request(struct request *rq)
+{
+	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+
+	rq_clear_domain_token(kqd, rq);
+	blk_mq_finish_request(rq);
+}
+
+/*
+ * Request completion hook. While latency monitoring is idle, check whether
+ * this read or synchronous write missed its latency target; if so, start a
+ * short statistics window so that throttling can kick in quickly. Requests in
+ * the KYBER_OTHER domain have no target and are ignored.
+ */
+static void kyber_completed_request(struct request *rq)
+{
+	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
+	const unsigned int domain = rq_sched_domain(rq);
+	u64 target, now, issued;
+
+	if (domain == KYBER_READ)
+		target = kqd->read_lat_nsec;
+	else if (domain == KYBER_SYNC_WRITE)
+		target = kqd->write_lat_nsec;
+	else
+		return;
+
+	/* Already monitoring latencies: no need for a per-request check. */
+	if (blk_stat_is_active(kqd->cb))
+		return;
+
+	now = __blk_stat_time(ktime_to_ns(ktime_get()));
+	issued = blk_stat_time(&rq->issue_stat);
+
+	/* Skip the check if the issue time is ahead of the current time. */
+	if (now < issued)
+		return;
+
+	if (now - issued > target)
+		blk_stat_activate_msecs(kqd->cb, 10);
+}
+
+/*
+ * Pull every request out of the software queues of @hctx and append each one,
+ * in order, to the pending list of its scheduling domain.
+ */
+static void kyber_flush_busy_ctxs(struct kyber_hctx_data *khd,
+				  struct blk_mq_hw_ctx *hctx)
+{
+	LIST_HEAD(flushed);
+	struct request *rq;
+
+	blk_mq_flush_busy_ctxs(hctx, &flushed);
+
+	/* Drain from the head so the original ordering is preserved. */
+	while (!list_empty(&flushed)) {
+		rq = list_first_entry(&flushed, struct request, queuelist);
+		list_move_tail(&rq->queuelist, &khd->rqs[rq_sched_domain(rq)]);
+	}
+}
+
+/*
+ * Wake function installed on the domain token wait queue entries: take the
+ * entry off the wait queue and asynchronously rerun the hardware queue stored
+ * in its private pointer. Always returns 1.
+ */
+static int kyber_domain_wake(wait_queue_t *wait, unsigned mode, int flags,
+			     void *key)
+{
+	struct blk_mq_hw_ctx *hw_queue = READ_ONCE(wait->private);
+
+	/* Leave the entry empty so it can be queued again later. */
+	list_del_init(&wait->task_list);
+	blk_mq_run_hw_queue(hw_queue, true);
+
+	return 1;
+}
+
+/*
+ * Try to take a token for the domain currently being dispatched
+ * (khd->cur_domain).
+ *
+ * Returns the token number (>= 0) on success. On failure a negative value is
+ * returned and the hardware queue's wait entry for this domain is left on one
+ * of the token bitmap's wait queues, so that kyber_domain_wake() reruns @hctx
+ * once a token is freed.
+ */
+static int kyber_get_domain_token(struct kyber_queue_data *kqd,
+				  struct kyber_hctx_data *khd,
+				  struct blk_mq_hw_ctx *hctx)
+{
+	unsigned int sched_domain = khd->cur_domain;
+	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
+	wait_queue_t *wait = &khd->domain_wait[sched_domain];
+	struct sbq_wait_state *ws;
+	int nr;
+
+	nr = __sbitmap_queue_get(domain_tokens);
+	if (nr >= 0)
+		return nr;
+
+	/*
+	 * If we failed to get a domain token, make sure the hardware queue is
+	 * run when one becomes available. Note that this is serialized on
+	 * khd->lock, but we still need to be careful about the waker.
+	 */
+	if (list_empty_careful(&wait->task_list)) {
+		init_waitqueue_func_entry(wait, kyber_domain_wake);
+		wait->private = hctx;
+		ws = sbq_wait_ptr(domain_tokens,
+				  &khd->wait_index[sched_domain]);
+		add_wait_queue(&ws->wait, wait);
+
+		/*
+		 * Try again in case a token was freed before we got on the wait
+		 * queue.
+		 */
+		nr = __sbitmap_queue_get(domain_tokens);
+
+		/*
+		 * If the retry succeeded we no longer need a wake-up. Get off
+		 * the wait queue again: a stale entry would consume a wake-up
+		 * that a hardware queue which is really waiting for a token
+		 * may need. remove_wait_queue() takes the wait queue lock, so
+		 * this is safe even if the waker already removed the entry;
+		 * reinitialize the list head afterwards so the entry reads as
+		 * "not queued" next time.
+		 */
+		if (nr >= 0) {
+			remove_wait_queue(&ws->wait, wait);
+			INIT_LIST_HEAD(&wait->task_list);
+		}
+	}
+	return nr;
+}
+
+/*
+ * Try to dispatch one request from the current domain (khd->cur_domain).
+ *
+ * If the domain has nothing pending and the software queues have not been
+ * flushed during this dispatch yet (*@flushed is false), they are flushed once
+ * and *@flushed is set. A request is only handed out if a domain token can be
+ * obtained for it.
+ *
+ * Returns the request, now removed from its pending list and carrying its
+ * token, or NULL if there was no pending request or no token.
+ */
+static struct request *
+kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
+			  struct kyber_hctx_data *khd,
+			  struct blk_mq_hw_ctx *hctx,
+			  bool *flushed)
+{
+	struct list_head *pending = &khd->rqs[khd->cur_domain];
+	struct request *rq;
+	int token;
+
+	rq = list_first_entry_or_null(pending, struct request, queuelist);
+	if (!rq && !*flushed) {
+		/* Nothing queued yet: pull in the software queues, look again. */
+		kyber_flush_busy_ctxs(khd, hctx);
+		*flushed = true;
+		rq = list_first_entry_or_null(pending, struct request,
+					      queuelist);
+	}
+	if (!rq)
+		return NULL;
+
+	token = kyber_get_domain_token(kqd, khd, hctx);
+	if (token < 0)
+		return NULL;
+
+	khd->batching++;
+	rq_set_domain_token(rq, token);
+	list_del_init(&rq->queuelist);
+	return rq;
+}
+
+/*
+ * Dispatch hook: pick the next request to send to @hctx, or NULL if nothing
+ * can be dispatched right now. Runs under khd->lock.
+ */
+static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
+	struct kyber_hctx_data *khd = hctx->sched_data;
+	struct request *rq = NULL;
+	bool flushed = false;
+	unsigned int tries;
+
+	spin_lock(&khd->lock);
+
+	/* Continue the current batch while its quota is not used up. */
+	if (khd->batching < kyber_batch_size[khd->cur_domain])
+		rq = kyber_dispatch_cur_domain(kqd, khd, hctx, &flushed);
+
+	if (!rq) {
+		/*
+		 * The batch is over: its quota ran out, or the domain had no
+		 * requests or no tokens. Start a new batch with the next
+		 * domain, visiting each domain once. The last one visited is
+		 * the domain we started from, so we come back to it if no
+		 * other domain has requests or tokens.
+		 */
+		khd->batching = 0;
+		for (tries = KYBER_NUM_DOMAINS; tries > 0 && !rq; tries--) {
+			if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
+				khd->cur_domain = 0;
+			else
+				khd->cur_domain++;
+
+			rq = kyber_dispatch_cur_domain(kqd, khd, hctx,
+						       &flushed);
+		}
+	}
+
+	spin_unlock(&khd->lock);
+	return rq;
+}
+
+/*
+ * Report whether any scheduling domain of @hctx has requests on its pending
+ * list. The lists are inspected without taking khd->lock.
+ */
+static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
+{
+	struct kyber_hctx_data *khd = hctx->sched_data;
+	unsigned int domain = 0;
+
+	while (domain < KYBER_NUM_DOMAINS) {
+		if (!list_empty_careful(&khd->rqs[domain]))
+			return true;
+		domain++;
+	}
+
+	return false;
+}
+
+/*
+ * Generate the show/store handlers for the <op> latency target, i.e. for
+ * kqd-><op>_lat_nsec:
+ *
+ * kyber_<op>_lat_show() prints the target as an unsigned decimal number
+ * followed by a newline.
+ * kyber_<op>_lat_store() parses the input as a base-10 unsigned number with
+ * kstrtoull(); it returns @count on success or the kstrtoull() error.
+ */
+#define KYBER_LAT_SHOW_STORE(op) \
+static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
+ char *page) \
+{ \
+ struct kyber_queue_data *kqd = e->elevator_data; \
+ \
+ return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
+} \
+ \
+static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
+ const char *page, size_t count) \
+{ \
+ struct kyber_queue_data *kqd = e->elevator_data; \
+ unsigned long long nsec; \
+ int ret; \
+ \
+ ret = kstrtoull(page, 10, &nsec); \
+ if (ret) \
+ return ret; \
+ \
+ kqd->op##_lat_nsec = nsec; \
+ \
+ return count; \
+}
+KYBER_LAT_SHOW_STORE(read);
+KYBER_LAT_SHOW_STORE(write);
+#undef KYBER_LAT_SHOW_STORE
+
+/*
+ * Elevator attributes: read_lat_nsec and write_lat_nsec, both mode 0644 and
+ * backed by the kyber_<op>_lat_show()/kyber_<op>_lat_store() handlers.
+ */
+#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
+static struct elv_fs_entry kyber_sched_attrs[] = {
+ KYBER_LAT_ATTR(read),
+ KYBER_LAT_ATTR(write),
+ __ATTR_NULL
+};
+#undef KYBER_LAT_ATTR
+
+/*
+ * Elevator type for the "kyber" scheduler, registered by kyber_init(). It is
+ * marked as a blk-mq scheduler (uses_mq) and wires the hooks defined in this
+ * file into the ops.mq table.
+ */
+static struct elevator_type kyber_sched = {
+ .ops.mq = {
+ .init_sched = kyber_init_sched,
+ .exit_sched = kyber_exit_sched,
+ .init_hctx = kyber_init_hctx,
+ .exit_hctx = kyber_exit_hctx,
+ .get_request = kyber_get_request,
+ .put_request = kyber_put_request,
+ .completed_request = kyber_completed_request,
+ .dispatch_request = kyber_dispatch_request,
+ .has_work = kyber_has_work,
+ },
+ .uses_mq = true,
+ .elevator_attrs = kyber_sched_attrs,
+ .elevator_name = "kyber",
+ .elevator_owner = THIS_MODULE,
+};
+
+/* Module init: register the Kyber elevator type; returns elv_register()'s result. */
+static int __init kyber_init(void)
+{
+ return elv_register(&kyber_sched);
+}
+
+/* Module exit: unregister the Kyber elevator type. */
+static void __exit kyber_exit(void)
+{
+ elv_unregister(&kyber_sched);
+}
+
+module_init(kyber_init);
+module_exit(kyber_exit);
+
+MODULE_AUTHOR("Omar Sandoval");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Kyber I/O scheduler");
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 7afb9907821fb..0171a2faad681 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -497,7 +497,6 @@ rescan:
if (disk->fops->revalidate_disk)
disk->fops->revalidate_disk(disk);
- blk_integrity_revalidate(disk);
check_disk_size_change(disk, bdev);
bdev->bd_invalidated = 0;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 2a2fc768b27ad..4a294a5f7fab2 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -262,11 +262,11 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
/*
* fill in all the output members
*/
- hdr->status = rq->errors & 0xff;
- hdr->masked_status = status_byte(rq->errors);
- hdr->msg_status = msg_byte(rq->errors);
- hdr->host_status = host_byte(rq->errors);
- hdr->driver_status = driver_byte(rq->errors);
+ hdr->status = req->result & 0xff;
+ hdr->masked_status = status_byte(req->result);
+ hdr->msg_status = msg_byte(req->result);
+ hdr->host_status = host_byte(req->result);
+ hdr->driver_status = driver_byte(req->result);
hdr->info = 0;
if (hdr->masked_status || hdr->host_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
@@ -362,7 +362,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
goto out_free_cdb;
bio = rq->bio;
- rq->retries = 0;
+ req->retries = 0;
start_time = jiffies;
@@ -476,13 +476,13 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
goto error;
/* default. possible overriden later */
- rq->retries = 5;
+ req->retries = 5;
switch (opcode) {
case SEND_DIAGNOSTIC:
case FORMAT_UNIT:
rq->timeout = FORMAT_UNIT_TIMEOUT;
- rq->retries = 1;
+ req->retries = 1;
break;
case START_STOP:
rq->timeout = START_STOP_TIMEOUT;
@@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
break;
case READ_DEFECT_DATA:
rq->timeout = READ_DEFECT_DATA_TIMEOUT;
- rq->retries = 1;
+ req->retries = 1;
break;
default:
rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
@@ -509,7 +509,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
blk_execute_rq(q, disk, rq, 0);
- err = rq->errors & 0xff; /* only 8 bit SCSI status */
+ err = req->result & 0xff; /* only 8 bit SCSI status */
if (err) {
if (req->sense_len && req->sense) {
bytes = (OMAX_SB_LEN > req->sense_len) ?
@@ -547,7 +547,8 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
scsi_req(rq)->cmd[0] = cmd;
scsi_req(rq)->cmd[4] = data;
scsi_req(rq)->cmd_len = 6;
- err = blk_execute_rq(q, bd_disk, rq, 0);
+ blk_execute_rq(q, bd_disk, rq, 0);
+ err = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
return err;
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 14035f826b5e3..9b30ae5ab843b 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -275,8 +275,8 @@ static bool check_tper(const void *data)
u8 flags = tper->supported_features;
if (!(flags & TPER_SYNC_SUPPORTED)) {
- pr_err("TPer sync not supported. flags = %d\n",
- tper->supported_features);
+ pr_debug("TPer sync not supported. flags = %d\n",
+ tper->supported_features);
return false;
}
@@ -289,7 +289,7 @@ static bool check_sum(const void *data)
u32 nlo = be32_to_cpu(sum->num_locking_objects);
if (nlo == 0) {
- pr_err("Need at least one locking object.\n");
+ pr_debug("Need at least one locking object.\n");
return false;
}
@@ -385,9 +385,9 @@ static int next(struct opal_dev *dev)
error = step->fn(dev, step->data);
if (error) {
- pr_err("Error on step function: %d with error %d: %s\n",
- state, error,
- opal_error_to_human(error));
+ pr_debug("Error on step function: %d with error %d: %s\n",
+ state, error,
+ opal_error_to_human(error));
/* For each OPAL command we do a discovery0 then we
* start some sort of session.
@@ -419,8 +419,8 @@ static int opal_discovery0_end(struct opal_dev *dev)
print_buffer(dev->resp, hlen);
if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
- pr_warn("Discovery length overflows buffer (%zu+%u)/%u\n",
- sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
+ pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
+ sizeof(*hdr), hlen, IO_BUFFER_LENGTH);
return -EFAULT;
}
@@ -503,7 +503,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
if (*err)
return;
if (cmd->pos >= IO_BUFFER_LENGTH - 1) {
- pr_err("Error adding u8: end of buffer.\n");
+ pr_debug("Error adding u8: end of buffer.\n");
*err = -ERANGE;
return;
}
@@ -553,7 +553,7 @@ static void add_token_u64(int *err, struct opal_dev *cmd, u64 number)
len = DIV_ROUND_UP(msb, 4);
if (cmd->pos >= IO_BUFFER_LENGTH - len - 1) {
- pr_err("Error adding u64: end of buffer.\n");
+ pr_debug("Error adding u64: end of buffer.\n");
*err = -ERANGE;
return;
}
@@ -579,7 +579,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
}
if (len >= IO_BUFFER_LENGTH - cmd->pos - header_len) {
- pr_err("Error adding bytestring: end of buffer.\n");
+ pr_debug("Error adding bytestring: end of buffer.\n");
*err = -ERANGE;
return;
}
@@ -597,7 +597,7 @@ static void add_token_bytestring(int *err, struct opal_dev *cmd,
static int build_locking_range(u8 *buffer, size_t length, u8 lr)
{
if (length > OPAL_UID_LENGTH) {
- pr_err("Can't build locking range. Length OOB\n");
+ pr_debug("Can't build locking range. Length OOB\n");
return -ERANGE;
}
@@ -614,7 +614,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr)
static int build_locking_user(u8 *buffer, size_t length, u8 lr)
{
if (length > OPAL_UID_LENGTH) {
- pr_err("Can't build locking range user, Length OOB\n");
+ pr_debug("Can't build locking range user, Length OOB\n");
return -ERANGE;
}
@@ -648,7 +648,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
add_token_u8(&err, cmd, OPAL_ENDLIST);
if (err) {
- pr_err("Error finalizing command.\n");
+ pr_debug("Error finalizing command.\n");
return -EFAULT;
}
@@ -660,7 +660,7 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr));
while (cmd->pos % 4) {
if (cmd->pos >= IO_BUFFER_LENGTH) {
- pr_err("Error: Buffer overrun\n");
+ pr_debug("Error: Buffer overrun\n");
return -ERANGE;
}
cmd->cmd[cmd->pos++] = 0;
@@ -679,14 +679,14 @@ static const struct opal_resp_tok *response_get_token(
const struct opal_resp_tok *tok;
if (n >= resp->num) {
- pr_err("Token number doesn't exist: %d, resp: %d\n",
- n, resp->num);
+ pr_debug("Token number doesn't exist: %d, resp: %d\n",
+ n, resp->num);
return ERR_PTR(-EINVAL);
}
tok = &resp->toks[n];
if (tok->len == 0) {
- pr_err("Token length must be non-zero\n");
+ pr_debug("Token length must be non-zero\n");
return ERR_PTR(-EINVAL);
}
@@ -727,7 +727,7 @@ static ssize_t response_parse_short(struct opal_resp_tok *tok,
tok->type = OPAL_DTA_TOKENID_UINT;
if (tok->len > 9) {
- pr_warn("uint64 with more than 8 bytes\n");
+ pr_debug("uint64 with more than 8 bytes\n");
return -EINVAL;
}
for (i = tok->len - 1; i > 0; i--) {
@@ -814,8 +814,8 @@ static int response_parse(const u8 *buf, size_t length,
if (clen == 0 || plen == 0 || slen == 0 ||
slen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
- pr_err("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
- clen, plen, slen);
+ pr_debug("Bad header length. cp: %u, pkt: %u, subpkt: %u\n",
+ clen, plen, slen);
print_buffer(pos, sizeof(*hdr));
return -EINVAL;
}
@@ -848,7 +848,7 @@ static int response_parse(const u8 *buf, size_t length,
}
if (num_entries == 0) {
- pr_err("Couldn't parse response.\n");
+ pr_debug("Couldn't parse response.\n");
return -EINVAL;
}
resp->num = num_entries;
@@ -861,18 +861,18 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
{
*store = NULL;
if (!resp) {
- pr_err("Response is NULL\n");
+ pr_debug("Response is NULL\n");
return 0;
}
if (n > resp->num) {
- pr_err("Response has %d tokens. Can't access %d\n",
- resp->num, n);
+ pr_debug("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
return 0;
}
if (resp->toks[n].type != OPAL_DTA_TOKENID_BYTESTRING) {
- pr_err("Token is not a byte string!\n");
+ pr_debug("Token is not a byte string!\n");
return 0;
}
@@ -883,26 +883,26 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
static u64 response_get_u64(const struct parsed_resp *resp, int n)
{
if (!resp) {
- pr_err("Response is NULL\n");
+ pr_debug("Response is NULL\n");
return 0;
}
if (n > resp->num) {
- pr_err("Response has %d tokens. Can't access %d\n",
- resp->num, n);
+ pr_debug("Response has %d tokens. Can't access %d\n",
+ resp->num, n);
return 0;
}
if (resp->toks[n].type != OPAL_DTA_TOKENID_UINT) {
- pr_err("Token is not unsigned it: %d\n",
- resp->toks[n].type);
+ pr_debug("Token is not unsigned it: %d\n",
+ resp->toks[n].type);
return 0;
}
if (!(resp->toks[n].width == OPAL_WIDTH_TINY ||
resp->toks[n].width == OPAL_WIDTH_SHORT)) {
- pr_err("Atom is not short or tiny: %d\n",
- resp->toks[n].width);
+ pr_debug("Atom is not short or tiny: %d\n",
+ resp->toks[n].width);
return 0;
}
@@ -949,7 +949,7 @@ static int parse_and_check_status(struct opal_dev *dev)
error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed);
if (error) {
- pr_err("Couldn't parse response.\n");
+ pr_debug("Couldn't parse response.\n");
return error;
}
@@ -975,7 +975,7 @@ static int start_opal_session_cont(struct opal_dev *dev)
tsn = response_get_u64(&dev->parsed, 5);
if (hsn == 0 && tsn == 0) {
- pr_err("Couldn't authenticate session\n");
+ pr_debug("Couldn't authenticate session\n");
return -EPERM;
}
@@ -1012,7 +1012,7 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
ret = cmd_finalize(dev, dev->hsn, dev->tsn);
if (ret) {
- pr_err("Error finalizing command buffer: %d\n", ret);
+ pr_debug("Error finalizing command buffer: %d\n", ret);
return ret;
}
@@ -1041,7 +1041,7 @@ static int gen_key(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building gen key command\n");
+ pr_debug("Error building gen key command\n");
return err;
}
@@ -1059,8 +1059,8 @@ static int get_active_key_cont(struct opal_dev *dev)
return error;
keylen = response_get_string(&dev->parsed, 4, &activekey);
if (!activekey) {
- pr_err("%s: Couldn't extract the Activekey from the response\n",
- __func__);
+ pr_debug("%s: Couldn't extract the Activekey from the response\n",
+ __func__);
return OPAL_INVAL_PARAM;
}
dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
@@ -1103,7 +1103,7 @@ static int get_active_key(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building get active key command\n");
+ pr_debug("Error building get active key command\n");
return err;
}
@@ -1159,7 +1159,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE,
0, 0);
if (err)
- pr_err("Failed to create enable global lr command\n");
+ pr_debug("Failed to create enable global lr command\n");
return err;
}
@@ -1217,7 +1217,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
}
if (err) {
- pr_err("Error building Setup Locking range command.\n");
+ pr_debug("Error building Setup Locking range command.\n");
return err;
}
@@ -1234,11 +1234,8 @@ static int start_generic_opal_session(struct opal_dev *dev,
u32 hsn;
int err = 0;
- if (key == NULL && auth != OPAL_ANYBODY_UID) {
- pr_err("%s: Attempted to open ADMIN_SP Session without a Host" \
- "Challenge, and not as the Anybody UID\n", __func__);
+ if (key == NULL && auth != OPAL_ANYBODY_UID)
return OPAL_INVAL_PARAM;
- }
clear_opal_cmd(dev);
@@ -1273,12 +1270,12 @@ static int start_generic_opal_session(struct opal_dev *dev,
add_token_u8(&err, dev, OPAL_ENDLIST);
break;
default:
- pr_err("Cannot start Admin SP session with auth %d\n", auth);
+ pr_debug("Cannot start Admin SP session with auth %d\n", auth);
return OPAL_INVAL_PARAM;
}
if (err) {
- pr_err("Error building start adminsp session command.\n");
+ pr_debug("Error building start adminsp session command.\n");
return err;
}
@@ -1369,7 +1366,7 @@ static int start_auth_opal_session(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building STARTSESSION command.\n");
+ pr_debug("Error building STARTSESSION command.\n");
return err;
}
@@ -1391,7 +1388,7 @@ static int revert_tper(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building REVERT TPER command.\n");
+ pr_debug("Error building REVERT TPER command.\n");
return err;
}
@@ -1426,7 +1423,7 @@ static int internal_activate_user(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building Activate UserN command.\n");
+ pr_debug("Error building Activate UserN command.\n");
return err;
}
@@ -1453,7 +1450,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building Erase Locking Range Command.\n");
+ pr_debug("Error building Erase Locking Range Command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1484,7 +1481,7 @@ static int set_mbr_done(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error Building set MBR Done command\n");
+ pr_debug("Error Building set MBR Done command\n");
return err;
}
@@ -1516,7 +1513,7 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error Building set MBR done command\n");
+ pr_debug("Error Building set MBR done command\n");
return err;
}
@@ -1567,7 +1564,7 @@ static int set_new_pw(struct opal_dev *dev, void *data)
if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len,
cpin_uid, dev)) {
- pr_err("Error building set password command.\n");
+ pr_debug("Error building set password command.\n");
return -ERANGE;
}
@@ -1582,7 +1579,7 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data)
memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH);
if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) {
- pr_err("Error building Set SID cpin\n");
+ pr_debug("Error building Set SID cpin\n");
return -ERANGE;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1657,7 +1654,7 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building add user to locking range command.\n");
+ pr_debug("Error building add user to locking range command.\n");
return err;
}
@@ -1691,7 +1688,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
/* vars are initialized to locked */
break;
default:
- pr_err("Tried to set an invalid locking state... returning to uland\n");
+ pr_debug("Tried to set an invalid locking state... returning to uland\n");
return OPAL_INVAL_PARAM;
}
@@ -1718,7 +1715,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building SET command.\n");
+ pr_debug("Error building SET command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1752,14 +1749,14 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
/* vars are initialized to locked */
break;
default:
- pr_err("Tried to set an invalid locking state.\n");
+ pr_debug("Tried to set an invalid locking state.\n");
return OPAL_INVAL_PARAM;
}
ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1,
read_locked, write_locked);
if (ret < 0) {
- pr_err("Error building SET command.\n");
+ pr_debug("Error building SET command.\n");
return ret;
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1811,7 +1808,7 @@ static int activate_lsp(struct opal_dev *dev, void *data)
}
if (err) {
- pr_err("Error building Activate LockingSP command.\n");
+ pr_debug("Error building Activate LockingSP command.\n");
return err;
}
@@ -1831,7 +1828,7 @@ static int get_lsp_lifecycle_cont(struct opal_dev *dev)
/* 0x08 is Manufactured Inactive */
/* 0x09 is Manufactured */
if (lc_status != OPAL_MANUFACTURED_INACTIVE) {
- pr_err("Couldn't determine the status of the Lifcycle state\n");
+ pr_debug("Couldn't determine the status of the Lifecycle state\n");
return -ENODEV;
}
@@ -1868,7 +1865,7 @@ static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error Building GET Lifecycle Status command\n");
+ pr_debug("Error Building GET Lifecycle Status command\n");
return err;
}
@@ -1887,7 +1884,7 @@ static int get_msid_cpin_pin_cont(struct opal_dev *dev)
strlen = response_get_string(&dev->parsed, 4, &msid_pin);
if (!msid_pin) {
- pr_err("%s: Couldn't extract PIN from response\n", __func__);
+ pr_debug("%s: Couldn't extract PIN from response\n", __func__);
return OPAL_INVAL_PARAM;
}
@@ -1929,7 +1926,7 @@ static int get_msid_cpin_pin(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_ENDLIST);
if (err) {
- pr_err("Error building Get MSID CPIN PIN command.\n");
+ pr_debug("Error building Get MSID CPIN PIN command.\n");
return err;
}
@@ -2124,18 +2121,18 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
if (lk_unlk->l_state != OPAL_RO &&
lk_unlk->l_state != OPAL_RW) {
- pr_err("Locking state was not RO or RW\n");
+ pr_debug("Locking state was not RO or RW\n");
return -EINVAL;
}
if (lk_unlk->session.who < OPAL_USER1 ||
lk_unlk->session.who > OPAL_USER9) {
- pr_err("Authority was not within the range of users: %d\n",
- lk_unlk->session.who);
+ pr_debug("Authority was not within the range of users: %d\n",
+ lk_unlk->session.who);
return -EINVAL;
}
if (lk_unlk->session.sum) {
- pr_err("%s not supported in sum. Use setup locking range\n",
- __func__);
+ pr_debug("%s not supported in sum. Use setup locking range\n",
+ __func__);
return -EINVAL;
}
@@ -2312,7 +2309,7 @@ static int opal_activate_user(struct opal_dev *dev,
/* We can't activate Admin1 it's active as manufactured */
if (opal_session->who < OPAL_USER1 ||
opal_session->who > OPAL_USER9) {
- pr_err("Who was not a valid user: %d\n", opal_session->who);
+ pr_debug("Who was not a valid user: %d\n", opal_session->who);
return -EINVAL;
}
@@ -2343,9 +2340,9 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
ret = __opal_lock_unlock(dev, &suspend->unlk);
if (ret) {
- pr_warn("Failed to unlock LR %hhu with sum %d\n",
- suspend->unlk.session.opal_key.lr,
- suspend->unlk.session.sum);
+ pr_debug("Failed to unlock LR %hhu with sum %d\n",
+ suspend->unlk.session.opal_key.lr,
+ suspend->unlk.session.sum);
was_failure = true;
}
}
@@ -2363,10 +2360,8 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
return -EACCES;
if (!dev)
return -ENOTSUPP;
- if (!dev->supported) {
- pr_err("Not supported\n");
+ if (!dev->supported)
return -ENOTSUPP;
- }
p = memdup_user(arg, _IOC_SIZE(cmd));
if (IS_ERR(p))
@@ -2410,7 +2405,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
ret = opal_secure_erase_locking_range(dev, p);
break;
default:
- pr_warn("No such Opal Ioctl %u\n", cmd);
+ break;
}
kfree(p);
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 2c97912335a90..680c6d6362983 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -160,28 +160,28 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
return t10_pi_verify(iter, t10_pi_ip_fn, 3);
}
-struct blk_integrity_profile t10_pi_type1_crc = {
+const struct blk_integrity_profile t10_pi_type1_crc = {
.name = "T10-DIF-TYPE1-CRC",
.generate_fn = t10_pi_type1_generate_crc,
.verify_fn = t10_pi_type1_verify_crc,
};
EXPORT_SYMBOL(t10_pi_type1_crc);
-struct blk_integrity_profile t10_pi_type1_ip = {
+const struct blk_integrity_profile t10_pi_type1_ip = {
.name = "T10-DIF-TYPE1-IP",
.generate_fn = t10_pi_type1_generate_ip,
.verify_fn = t10_pi_type1_verify_ip,
};
EXPORT_SYMBOL(t10_pi_type1_ip);
-struct blk_integrity_profile t10_pi_type3_crc = {
+const struct blk_integrity_profile t10_pi_type3_crc = {
.name = "T10-DIF-TYPE3-CRC",
.generate_fn = t10_pi_type3_generate_crc,
.verify_fn = t10_pi_type3_verify_crc,
};
EXPORT_SYMBOL(t10_pi_type3_crc);
-struct blk_integrity_profile t10_pi_type3_ip = {
+const struct blk_integrity_profile t10_pi_type3_ip = {
.name = "T10-DIF-TYPE3-IP",
.generate_fn = t10_pi_type3_generate_ip,
.verify_fn = t10_pi_type3_verify_ip,
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f744de7a0f9b2..19df4918e37ea 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -312,22 +312,6 @@ config BLK_DEV_SKD
Use device /dev/skd$N and /dev/skd$Np$M.
-config BLK_DEV_OSD
- tristate "OSD object-as-blkdev support"
- depends on SCSI_OSD_ULD
- ---help---
- Saying Y or M here will allow the exporting of a single SCSI
- OSD (object-based storage) object as a Linux block device.
-
- For example, if you create a 2G object on an OSD device,
- you can then use this module to present that 2G object as
- a Linux block device.
-
- To compile this driver as a module, choose M here: the
- module will be called osdblk.
-
- If unsure, say N.
-
config BLK_DEV_SX8
tristate "Promise SATA SX8 support"
depends on PCI
@@ -434,23 +418,6 @@ config ATA_OVER_ETH
This driver provides Support for ATA over Ethernet block
devices like the Coraid EtherDrive (R) Storage Blade.
-config MG_DISK
- tristate "mGine mflash, gflash support"
- depends on ARM && GPIOLIB
- help
- mGine mFlash(gFlash) block device driver
-
-config MG_DISK_RES
- int "Size of reserved area before MBR"
- depends on MG_DISK
- default 0
- help
- Define size of reserved area that usually used for boot. Unit is KB.
- All of the block device operation will be taken this value as start
- offset
- Examples:
- 1024 => 1 MB
-
config SUNVDC
tristate "Sun Virtual Disk Client support"
depends on SUN_LDOMS
@@ -512,19 +479,7 @@ config VIRTIO_BLK_SCSI
Enable support for SCSI passthrough (e.g. the SG_IO ioctl) on
virtio-blk devices. This is only supported for the legacy
virtio protocol and not enabled by default by any hypervisor.
- Your probably want to virtio-scsi instead.
-
-config BLK_DEV_HD
- bool "Very old hard disk (MFM/RLL/IDE) driver"
- depends on HAVE_IDE
- depends on !ARM || ARCH_RPC || BROKEN
- help
- This is a very old hard disk driver that lacks the enhanced
- functionality of the newer ones.
-
- It is required for systems with ancient MFM/RLL/ESDI drives.
-
- If unsure, say N.
+ You probably want to use virtio-scsi instead.
config BLK_DEV_RBD
tristate "Rados block device (RBD)"
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 1e9661e26f294..ec8c36897b753 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -19,10 +19,8 @@ obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o
obj-$(CONFIG_XILINX_SYSACE) += xsysace.o
obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
-obj-$(CONFIG_MG_DISK) += mg_disk.o
obj-$(CONFIG_SUNVDC) += sunvdc.o
obj-$(CONFIG_BLK_DEV_SKD) += skd.o
-obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o
obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
@@ -30,7 +28,6 @@ obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
-obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 2104b1b4ccda2..fa69ecd52cb57 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -617,12 +617,12 @@ static void fd_error( void )
if (!fd_request)
return;
- fd_request->errors++;
- if (fd_request->errors >= MAX_ERRORS) {
+ fd_request->error_count++;
+ if (fd_request->error_count >= MAX_ERRORS) {
printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
fd_end_request_cur(-EIO);
}
- else if (fd_request->errors == RECALIBRATE_ERRORS) {
+ else if (fd_request->error_count == RECALIBRATE_ERRORS) {
printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
if (SelectedDrive != -1)
SUD.track = -1;
@@ -1386,7 +1386,7 @@ static void setup_req_params( int drive )
ReqData = ReqBuffer + 512 * ReqCnt;
if (UseTrackbuffer)
- read_track = (ReqCmd == READ && fd_request->errors == 0);
+ read_track = (ReqCmd == READ && fd_request->error_count == 0);
else
read_track = 0;
@@ -1409,8 +1409,10 @@ static struct request *set_next_request(void)
fdc_queue = 0;
if (q) {
rq = blk_fetch_request(q);
- if (rq)
+ if (rq) {
+ rq->error_count = 0;
break;
+ }
}
} while (fdc_queue != old_pos);
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 3adc32a3153b2..4ec84d504780d 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -134,28 +134,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
return page;
}
-static void brd_free_page(struct brd_device *brd, sector_t sector)
-{
- struct page *page;
- pgoff_t idx;
-
- spin_lock(&brd->brd_lock);
- idx = sector >> PAGE_SECTORS_SHIFT;
- page = radix_tree_delete(&brd->brd_pages, idx);
- spin_unlock(&brd->brd_lock);
- if (page)
- __free_page(page);
-}
-
-static void brd_zero_page(struct brd_device *brd, sector_t sector)
-{
- struct page *page;
-
- page = brd_lookup_page(brd, sector);
- if (page)
- clear_highpage(page);
-}
-
/*
* Free all backing store pages and radix tree. This must only be called when
* there are no other users of the device.
@@ -212,24 +190,6 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
return 0;
}
-static void discard_from_brd(struct brd_device *brd,
- sector_t sector, size_t n)
-{
- while (n >= PAGE_SIZE) {
- /*
- * Don't want to actually discard pages here because
- * re-allocating the pages can result in writeback
- * deadlocks under heavy load.
- */
- if (0)
- brd_free_page(brd, sector);
- else
- brd_zero_page(brd, sector);
- sector += PAGE_SIZE >> SECTOR_SHIFT;
- n -= PAGE_SIZE;
- }
-}
-
/*
* Copy n bytes from src to the brd starting at sector. Does not sleep.
*/
@@ -338,14 +298,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
goto io_error;
- if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
- if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) ||
- bio->bi_iter.bi_size & ~PAGE_MASK)
- goto io_error;
- discard_from_brd(brd, sector, bio->bi_iter.bi_size);
- goto out;
- }
-
bio_for_each_segment(bvec, bio, iter) {
unsigned int len = bvec.bv_len;
int err;
@@ -357,7 +309,6 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
sector += len >> SECTOR_SHIFT;
}
-out:
bio_endio(bio);
return BLK_QC_T_NONE;
io_error:
@@ -464,11 +415,6 @@ static struct brd_device *brd_alloc(int i)
* is harmless)
*/
blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
-
- brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
- blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
- brd->brd_queue->limits.discard_zeroes_data = 1;
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
#ifdef CONFIG_BLK_DEV_RAM_DAX
queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
#endif
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8e1a4554951c0..cd375503f7b0d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1864,8 +1864,7 @@ static void cciss_softirq_done(struct request *rq)
/* set the residual count for pc requests */
if (blk_rq_is_passthrough(rq))
scsi_req(rq)->resid_len = c->err_info->ResidualCnt;
-
- blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
+ blk_end_request_all(rq, scsi_req(rq)->result ? -EIO : 0);
spin_lock_irqsave(&h->lock, flags);
cmd_free(h, c);
@@ -3140,18 +3139,19 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
{
int retry_cmd = 0;
struct request *rq = cmd->rq;
+ struct scsi_request *sreq = scsi_req(rq);
- rq->errors = 0;
+ sreq->result = 0;
if (timeout)
- rq->errors = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
+ sreq->result = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
if (cmd->err_info->CommandStatus == 0) /* no error has occurred */
goto after_error_processing;
switch (cmd->err_info->CommandStatus) {
case CMD_TARGET_STATUS:
- rq->errors = evaluate_target_status(h, cmd, &retry_cmd);
+ sreq->result = evaluate_target_status(h, cmd, &retry_cmd);
break;
case CMD_DATA_UNDERRUN:
if (!blk_rq_is_passthrough(cmd->rq)) {
@@ -3169,7 +3169,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_INVALID:
dev_warn(&h->pdev->dev, "cciss: cmd %p is "
"reported invalid\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3177,7 +3177,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_PROTOCOL_ERR:
dev_warn(&h->pdev->dev, "cciss: cmd %p has "
"protocol error\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3185,7 +3185,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_HARDWARE_ERR:
dev_warn(&h->pdev->dev, "cciss: cmd %p had "
" hardware error\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3193,7 +3193,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_CONNECTION_LOST:
dev_warn(&h->pdev->dev, "cciss: cmd %p had "
"connection lost\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3201,7 +3201,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_ABORTED:
dev_warn(&h->pdev->dev, "cciss: cmd %p was "
"aborted\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ABORT);
@@ -3209,7 +3209,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
case CMD_ABORT_FAILED:
dev_warn(&h->pdev->dev, "cciss: cmd %p reports "
"abort failed\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3224,21 +3224,21 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
} else
dev_warn(&h->pdev->dev,
"%p retried too many times\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ABORT);
break;
case CMD_TIMEOUT:
dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
break;
case CMD_UNABORTABLE:
dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3247,7 +3247,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
dev_warn(&h->pdev->dev, "cmd %p returned "
"unknown status %x\n", cmd,
cmd->err_info->CommandStatus);
- rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ sreq->result = make_status_bytes(SAM_STAT_GOOD,
cmd->err_info->CommandStatus, DRIVER_OK,
blk_rq_is_passthrough(cmd->rq) ?
DID_PASSTHROUGH : DID_ERROR);
@@ -3380,9 +3380,9 @@ static void do_cciss_request(struct request_queue *q)
if (dma_mapping_error(&h->pdev->dev, temp64.val)) {
dev_warn(&h->pdev->dev,
"%s: error mapping page for DMA\n", __func__);
- creq->errors = make_status_bytes(SAM_STAT_GOOD,
- 0, DRIVER_OK,
- DID_SOFT_ERROR);
+ scsi_req(creq)->result =
+ make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK,
+ DID_SOFT_ERROR);
cmd_free(h, c);
return;
}
@@ -3395,9 +3395,9 @@ static void do_cciss_request(struct request_queue *q)
if (cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex],
(seg - (h->max_cmd_sgentries - 1)) *
sizeof(SGDescriptor_struct))) {
- creq->errors = make_status_bytes(SAM_STAT_GOOD,
- 0, DRIVER_OK,
- DID_SOFT_ERROR);
+ scsi_req(creq)->result =
+ make_status_bytes(SAM_STAT_GOOD, 0, DRIVER_OK,
+ DID_SOFT_ERROR);
cmd_free(h, c);
return;
}
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index de5c3ee8a7906..494837e59f232 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -236,9 +236,6 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
-
- if (f & EE_IS_TRIM)
- __seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
seq_putc(m, '\n');
}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 724d1c50fc528..d5da45bb03a66 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -437,9 +437,6 @@ enum {
/* is this a TRIM aka REQ_DISCARD? */
__EE_IS_TRIM,
- /* our lower level cannot handle trim,
- * and we want to fall back to zeroout instead */
- __EE_IS_TRIM_USE_ZEROOUT,
/* In case a barrier failed,
* we need to resubmit without the barrier flag. */
@@ -482,7 +479,6 @@ enum {
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_IS_TRIM (1<<__EE_IS_TRIM)
-#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
@@ -1561,8 +1557,6 @@ extern void start_resync_timer_fn(unsigned long data);
extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
/* drbd_receiver.c */
-extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
- sector_t start, unsigned int nr_sectors, bool discard);
extern int drbd_receiver(struct drbd_thread *thi);
extern int drbd_ack_receiver(struct drbd_thread *thi);
extern void drbd_send_ping_wf(struct work_struct *ws);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 92c60cbd04ee8..84455c365f578 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -931,7 +931,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
p->qlim->io_min = cpu_to_be32(queue_io_min(q));
p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
p->qlim->discard_enabled = blk_queue_discard(q);
- p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
} else {
q = device->rq_queue;
@@ -941,7 +940,6 @@ void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct r
p->qlim->io_min = cpu_to_be32(queue_io_min(q));
p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
p->qlim->discard_enabled = 0;
- p->qlim->discard_zeroes_data = 0;
p->qlim->write_same_capable = 0;
}
}
@@ -1668,7 +1666,8 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
(bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
- (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
+ (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
+ (bio_op(bio) == REQ_OP_WRITE_ZEROES ? DP_DISCARD : 0);
else
return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 908c704e20aa7..02255a0d68b9a 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1199,10 +1199,6 @@ static void decide_on_discard_support(struct drbd_device *device,
struct drbd_connection *connection = first_peer_device(device)->connection;
bool can_do = b ? blk_queue_discard(b) : true;
- if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) {
- can_do = false;
- drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
- }
if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
can_do = false;
drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
@@ -1217,10 +1213,12 @@ static void decide_on_discard_support(struct drbd_device *device,
blk_queue_discard_granularity(q, 512);
q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ q->limits.max_write_zeroes_sectors = drbd_max_discard_sectors(connection);
} else {
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
blk_queue_discard_granularity(q, 0);
q->limits.max_discard_sectors = 0;
+ q->limits.max_write_zeroes_sectors = 0;
}
}
@@ -1482,8 +1480,7 @@ static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *dis
if (disk_conf->al_extents > drbd_al_extents_max(nbc))
disk_conf->al_extents = drbd_al_extents_max(nbc);
- if (!blk_queue_discard(q)
- || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) {
+ if (!blk_queue_discard(q)) {
if (disk_conf->rs_discard_granularity) {
disk_conf->rs_discard_granularity = 0; /* disable feature */
drbd_info(device, "rs_discard_granularity feature disabled\n");
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index aa6bf9692effe..1b0a2be24f39e 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1448,105 +1448,14 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
}
-/*
- * We *may* ignore the discard-zeroes-data setting, if so configured.
- *
- * Assumption is that it "discard_zeroes_data=0" is only because the backend
- * may ignore partial unaligned discards.
- *
- * LVM/DM thin as of at least
- * LVM version: 2.02.115(2)-RHEL7 (2015-01-28)
- * Library version: 1.02.93-RHEL7 (2015-01-28)
- * Driver version: 4.29.0
- * still behaves this way.
- *
- * For unaligned (wrt. alignment and granularity) or too small discards,
- * we zero-out the initial (and/or) trailing unaligned partial chunks,
- * but discard all the aligned full chunks.
- *
- * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1".
- */
-int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard)
-{
- struct block_device *bdev = device->ldev->backing_bdev;
- struct request_queue *q = bdev_get_queue(bdev);
- sector_t tmp, nr;
- unsigned int max_discard_sectors, granularity;
- int alignment;
- int err = 0;
-
- if (!discard)
- goto zero_out;
-
- /* Zero-sector (unknown) and one-sector granularities are the same. */
- granularity = max(q->limits.discard_granularity >> 9, 1U);
- alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
-
- max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
- max_discard_sectors -= max_discard_sectors % granularity;
- if (unlikely(!max_discard_sectors))
- goto zero_out;
-
- if (nr_sectors < granularity)
- goto zero_out;
-
- tmp = start;
- if (sector_div(tmp, granularity) != alignment) {
- if (nr_sectors < 2*granularity)
- goto zero_out;
- /* start + gran - (start + gran - align) % gran */
- tmp = start + granularity - alignment;
- tmp = start + granularity - sector_div(tmp, granularity);
-
- nr = tmp - start;
- err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
- nr_sectors -= nr;
- start = tmp;
- }
- while (nr_sectors >= granularity) {
- nr = min_t(sector_t, nr_sectors, max_discard_sectors);
- err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
- nr_sectors -= nr;
- start += nr;
- }
- zero_out:
- if (nr_sectors) {
- err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0);
- }
- return err != 0;
-}
-
-static bool can_do_reliable_discards(struct drbd_device *device)
-{
- struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
- struct disk_conf *dc;
- bool can_do;
-
- if (!blk_queue_discard(q))
- return false;
-
- if (q->limits.discard_zeroes_data)
- return true;
-
- rcu_read_lock();
- dc = rcu_dereference(device->ldev->disk_conf);
- can_do = dc->discard_zeroes_if_aligned;
- rcu_read_unlock();
- return can_do;
-}
-
static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
- /* If the backend cannot discard, or does not guarantee
- * read-back zeroes in discarded ranges, we fall back to
- * zero-out. Unless configuration specifically requested
- * otherwise. */
- if (!can_do_reliable_discards(device))
- peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+ struct block_device *bdev = device->ldev->backing_bdev;
- if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
- peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT)))
+ if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9,
+ GFP_NOIO, 0))
peer_req->flags |= EE_WAS_ERROR;
+
drbd_endio_write_sec_final(peer_req);
}
@@ -2376,7 +2285,7 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf)
static unsigned long wire_flags_to_bio_op(u32 dpf)
{
if (dpf & DP_DISCARD)
- return REQ_OP_DISCARD;
+ return REQ_OP_WRITE_ZEROES;
else
return REQ_OP_WRITE;
}
@@ -2567,7 +2476,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
op_flags = wire_flags_to_bio_flags(dp_flags);
if (pi->cmd == P_TRIM) {
D_ASSERT(peer_device, peer_req->i.size > 0);
- D_ASSERT(peer_device, op == REQ_OP_DISCARD);
+ D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
D_ASSERT(peer_device, peer_req->pages == NULL);
} else if (peer_req->pages == NULL) {
D_ASSERT(device, peer_req->i.size == 0);
@@ -4880,7 +4789,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
if (get_ldev(device)) {
struct drbd_peer_request *peer_req;
- const int op = REQ_OP_DISCARD;
+ const int op = REQ_OP_WRITE_ZEROES;
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
size, 0, GFP_NOIO);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 652114ae1a8ae..b5730e17b4558 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,6 +59,7 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio
drbd_req_make_private_bio(req, bio_src);
req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
| (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0)
+ | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_UNMAP : 0)
| (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
req->device = device;
req->master_bio = bio_src;
@@ -1148,10 +1149,10 @@ static int drbd_process_write_request(struct drbd_request *req)
static void drbd_process_discard_req(struct drbd_request *req)
{
- int err = drbd_issue_discard_or_zero_out(req->device,
- req->i.sector, req->i.size >> 9, true);
+ struct block_device *bdev = req->device->ldev->backing_bdev;
- if (err)
+ if (blkdev_issue_zeroout(bdev, req->i.sector, req->i.size >> 9,
+ GFP_NOIO, 0))
req->private_bio->bi_error = -EIO;
bio_endio(req->private_bio);
}
@@ -1180,7 +1181,8 @@ drbd_submit_req_private_bio(struct drbd_request *req)
if (get_ldev(device)) {
if (drbd_insert_fault(device, type))
bio_io_error(bio);
- else if (bio_op(bio) == REQ_OP_DISCARD)
+ else if (bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+ bio_op(bio) == REQ_OP_DISCARD)
drbd_process_discard_req(req);
else
generic_make_request(bio);
@@ -1234,7 +1236,8 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long
_drbd_start_io_acct(device, req);
/* process discards always from our submitter thread */
- if (bio_op(bio) & REQ_OP_DISCARD)
+ if ((bio_op(bio) & REQ_OP_WRITE_ZEROES) ||
+ (bio_op(bio) & REQ_OP_DISCARD))
goto queue_for_submitter_thread;
if (rw == WRITE && req->private_bio && req->i.size
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 3bff33f21435c..1afcb4e02d8d9 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -174,7 +174,8 @@ void drbd_peer_request_endio(struct bio *bio)
struct drbd_peer_request *peer_req = bio->bi_private;
struct drbd_device *device = peer_req->peer_device->device;
bool is_write = bio_data_dir(bio) == WRITE;
- bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD);
+ bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+ bio_op(bio) == REQ_OP_DISCARD;
if (bio->bi_error && __ratelimit(&drbd_ratelimit_state))
drbd_warn(device, "%s: error=%d s=%llus\n",
@@ -249,6 +250,7 @@ void drbd_request_endio(struct bio *bio)
/* to avoid recursion in __req_mod */
if (unlikely(bio->bi_error)) {
switch (bio_op(bio)) {
+ case REQ_OP_WRITE_ZEROES:
case REQ_OP_DISCARD:
if (bio->bi_error == -EOPNOTSUPP)
what = DISCARD_COMPLETED_NOTSUPP;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 45b4384f650cc..60d4c76531783 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2805,8 +2805,10 @@ static int set_next_request(void)
fdc_queue = 0;
if (q) {
current_req = blk_fetch_request(q);
- if (current_req)
+ if (current_req) {
+ current_req->error_count = 0;
break;
+ }
}
} while (fdc_queue != old_pos);
@@ -2866,7 +2868,7 @@ do_request:
_floppy = floppy_type + DP->autodetect[DRS->probed_format];
} else
probing = 0;
- errors = &(current_req->errors);
+ errors = &(current_req->error_count);
tmp = make_raw_rw_request();
if (tmp < 2) {
request_done(tmp);
@@ -4207,9 +4209,7 @@ static int __init do_floppy_init(void)
disks[drive]->fops = &floppy_fops;
sprintf(disks[drive]->disk_name, "fd%d", drive);
- init_timer(&motor_off_timer[drive]);
- motor_off_timer[drive].data = drive;
- motor_off_timer[drive].function = motor_off_callback;
+ setup_timer(&motor_off_timer[drive], motor_off_callback, drive);
}
err = register_blkdev(FLOPPY_MAJOR, "fd");
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
deleted file mode 100644
index 6043648da1e85..0000000000000
--- a/drivers/block/hd.c
+++ /dev/null
@@ -1,803 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * This is the low-level hd interrupt support. It traverses the
- * request-list, using interrupts to jump between functions. As
- * all the functions are called within interrupts, we may not
- * sleep. Special care is recommended.
- *
- * modified by Drew Eckhardt to check nr of hd's from the CMOS.
- *
- * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
- * in the early extended-partition checks and added DM partitions
- *
- * IRQ-unmask, drive-id, multiple-mode, support for ">16 heads",
- * and general streamlining by Mark Lord.
- *
- * Removed 99% of above. Use Mark's ide driver for those options.
- * This is now a lightweight ST-506 driver. (Paul Gortmaker)
- *
- * Modified 1995 Russell King for ARM processor.
- *
- * Bugfix: max_sectors must be <= 255 or the wheels tend to come
- * off in a hurry once you queue things up - Paul G. 02/2001
- */
-
-/* Uncomment the following if you want verbose error reports. */
-/* #define VERBOSE_ERRORS */
-
-#include <linux/blkdev.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/interrupt.h>
-#include <linux/timer.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/genhd.h>
-#include <linux/string.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/blkpg.h>
-#include <linux/ata.h>
-#include <linux/hdreg.h>
-
-#define HD_IRQ 14
-
-#define REALLY_SLOW_IO
-#include <asm/io.h>
-#include <linux/uaccess.h>
-
-#ifdef __arm__
-#undef HD_IRQ
-#endif
-#include <asm/irq.h>
-#ifdef __arm__
-#define HD_IRQ IRQ_HARDDISK
-#endif
-
-/* Hd controller regster ports */
-
-#define HD_DATA 0x1f0 /* _CTL when writing */
-#define HD_ERROR 0x1f1 /* see err-bits */
-#define HD_NSECTOR 0x1f2 /* nr of sectors to read/write */
-#define HD_SECTOR 0x1f3 /* starting sector */
-#define HD_LCYL 0x1f4 /* starting cylinder */
-#define HD_HCYL 0x1f5 /* high byte of starting cyl */
-#define HD_CURRENT 0x1f6 /* 101dhhhh , d=drive, hhhh=head */
-#define HD_STATUS 0x1f7 /* see status-bits */
-#define HD_FEATURE HD_ERROR /* same io address, read=error, write=feature */
-#define HD_PRECOMP HD_FEATURE /* obsolete use of this port - predates IDE */
-#define HD_COMMAND HD_STATUS /* same io address, read=status, write=cmd */
-
-#define HD_CMD 0x3f6 /* used for resets */
-#define HD_ALTSTATUS 0x3f6 /* same as HD_STATUS but doesn't clear irq */
-
-/* Bits of HD_STATUS */
-#define ERR_STAT 0x01
-#define INDEX_STAT 0x02
-#define ECC_STAT 0x04 /* Corrected error */
-#define DRQ_STAT 0x08
-#define SEEK_STAT 0x10
-#define SERVICE_STAT SEEK_STAT
-#define WRERR_STAT 0x20
-#define READY_STAT 0x40
-#define BUSY_STAT 0x80
-
-/* Bits for HD_ERROR */
-#define MARK_ERR 0x01 /* Bad address mark */
-#define TRK0_ERR 0x02 /* couldn't find track 0 */
-#define ABRT_ERR 0x04 /* Command aborted */
-#define MCR_ERR 0x08 /* media change request */
-#define ID_ERR 0x10 /* ID field not found */
-#define MC_ERR 0x20 /* media changed */
-#define ECC_ERR 0x40 /* Uncorrectable ECC error */
-#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */
-#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */
-
-static DEFINE_SPINLOCK(hd_lock);
-static struct request_queue *hd_queue;
-static struct request *hd_req;
-
-#define TIMEOUT_VALUE (6*HZ)
-#define HD_DELAY 0
-
-#define MAX_ERRORS 16 /* Max read/write errors/sector */
-#define RESET_FREQ 8 /* Reset controller every 8th retry */
-#define RECAL_FREQ 4 /* Recalibrate every 4th retry */
-#define MAX_HD 2
-
-#define STAT_OK (READY_STAT|SEEK_STAT)
-#define OK_STATUS(s) (((s)&(STAT_OK|(BUSY_STAT|WRERR_STAT|ERR_STAT)))==STAT_OK)
-
-static void recal_intr(void);
-static void bad_rw_intr(void);
-
-static int reset;
-static int hd_error;
-
-/*
- * This struct defines the HD's and their types.
- */
-struct hd_i_struct {
- unsigned int head, sect, cyl, wpcom, lzone, ctl;
- int unit;
- int recalibrate;
- int special_op;
-};
-
-#ifdef HD_TYPE
-static struct hd_i_struct hd_info[] = { HD_TYPE };
-static int NR_HD = ARRAY_SIZE(hd_info);
-#else
-static struct hd_i_struct hd_info[MAX_HD];
-static int NR_HD;
-#endif
-
-static struct gendisk *hd_gendisk[MAX_HD];
-
-static struct timer_list device_timer;
-
-#define TIMEOUT_VALUE (6*HZ)
-
-#define SET_TIMER \
- do { \
- mod_timer(&device_timer, jiffies + TIMEOUT_VALUE); \
- } while (0)
-
-static void (*do_hd)(void) = NULL;
-#define SET_HANDLER(x) \
-if ((do_hd = (x)) != NULL) \
- SET_TIMER; \
-else \
- del_timer(&device_timer);
-
-
-#if (HD_DELAY > 0)
-
-#include <linux/i8253.h>
-
-unsigned long last_req;
-
-unsigned long read_timer(void)
-{
- unsigned long t, flags;
- int i;
-
- raw_spin_lock_irqsave(&i8253_lock, flags);
- t = jiffies * 11932;
- outb_p(0, 0x43);
- i = inb_p(0x40);
- i |= inb(0x40) << 8;
- raw_spin_unlock_irqrestore(&i8253_lock, flags);
- return(t - i);
-}
-#endif
-
-static void __init hd_setup(char *str, int *ints)
-{
- int hdind = 0;
-
- if (ints[0] != 3)
- return;
- if (hd_info[0].head != 0)
- hdind = 1;
- hd_info[hdind].head = ints[2];
- hd_info[hdind].sect = ints[3];
- hd_info[hdind].cyl = ints[1];
- hd_info[hdind].wpcom = 0;
- hd_info[hdind].lzone = ints[1];
- hd_info[hdind].ctl = (ints[2] > 8 ? 8 : 0);
- NR_HD = hdind+1;
-}
-
-static bool hd_end_request(int err, unsigned int bytes)
-{
- if (__blk_end_request(hd_req, err, bytes))
- return true;
- hd_req = NULL;
- return false;
-}
-
-static bool hd_end_request_cur(int err)
-{
- return hd_end_request(err, blk_rq_cur_bytes(hd_req));
-}
-
-static void dump_status(const char *msg, unsigned int stat)
-{
- char *name = "hd?";
- if (hd_req)
- name = hd_req->rq_disk->disk_name;
-
-#ifdef VERBOSE_ERRORS
- printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
- if (stat & BUSY_STAT) printk("Busy ");
- if (stat & READY_STAT) printk("DriveReady ");
- if (stat & WRERR_STAT) printk("WriteFault ");
- if (stat & SEEK_STAT) printk("SeekComplete ");
- if (stat & DRQ_STAT) printk("DataRequest ");
- if (stat & ECC_STAT) printk("CorrectedError ");
- if (stat & INDEX_STAT) printk("Index ");
- if (stat & ERR_STAT) printk("Error ");
- printk("}\n");
- if ((stat & ERR_STAT) == 0) {
- hd_error = 0;
- } else {
- hd_error = inb(HD_ERROR);
- printk("%s: %s: error=0x%02x { ", name, msg, hd_error & 0xff);
- if (hd_error & BBD_ERR) printk("BadSector ");
- if (hd_error & ECC_ERR) printk("UncorrectableError ");
- if (hd_error & ID_ERR) printk("SectorIdNotFound ");
- if (hd_error & ABRT_ERR) printk("DriveStatusError ");
- if (hd_error & TRK0_ERR) printk("TrackZeroNotFound ");
- if (hd_error & MARK_ERR) printk("AddrMarkNotFound ");
- printk("}");
- if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) {
- printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
- inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
- if (hd_req)
- printk(", sector=%ld", blk_rq_pos(hd_req));
- }
- printk("\n");
- }
-#else
- printk("%s: %s: status=0x%02x.\n", name, msg, stat & 0xff);
- if ((stat & ERR_STAT) == 0) {
- hd_error = 0;
- } else {
- hd_error = inb(HD_ERROR);
- printk("%s: %s: error=0x%02x.\n", name, msg, hd_error & 0xff);
- }
-#endif
-}
-
-static void check_status(void)
-{
- int i = inb_p(HD_STATUS);
-
- if (!OK_STATUS(i)) {
- dump_status("check_status", i);
- bad_rw_intr();
- }
-}
-
-static int controller_busy(void)
-{
- int retries = 100000;
- unsigned char status;
-
- do {
- status = inb_p(HD_STATUS);
- } while ((status & BUSY_STAT) && --retries);
- return status;
-}
-
-static int status_ok(void)
-{
- unsigned char status = inb_p(HD_STATUS);
-
- if (status & BUSY_STAT)
- return 1; /* Ancient, but does it make sense??? */
- if (status & WRERR_STAT)
- return 0;
- if (!(status & READY_STAT))
- return 0;
- if (!(status & SEEK_STAT))
- return 0;
- return 1;
-}
-
-static int controller_ready(unsigned int drive, unsigned int head)
-{
- int retry = 100;
-
- do {
- if (controller_busy() & BUSY_STAT)
- return 0;
- outb_p(0xA0 | (drive<<4) | head, HD_CURRENT);
- if (status_ok())
- return 1;
- } while (--retry);
- return 0;
-}
-
-static void hd_out(struct hd_i_struct *disk,
- unsigned int nsect,
- unsigned int sect,
- unsigned int head,
- unsigned int cyl,
- unsigned int cmd,
- void (*intr_addr)(void))
-{
- unsigned short port;
-
-#if (HD_DELAY > 0)
- while (read_timer() - last_req < HD_DELAY)
- /* nothing */;
-#endif
- if (reset)
- return;
- if (!controller_ready(disk->unit, head)) {
- reset = 1;
- return;
- }
- SET_HANDLER(intr_addr);
- outb_p(disk->ctl, HD_CMD);
- port = HD_DATA;
- outb_p(disk->wpcom >> 2, ++port);
- outb_p(nsect, ++port);
- outb_p(sect, ++port);
- outb_p(cyl, ++port);
- outb_p(cyl >> 8, ++port);
- outb_p(0xA0 | (disk->unit << 4) | head, ++port);
- outb_p(cmd, ++port);
-}
-
-static void hd_request (void);
-
-static int drive_busy(void)
-{
- unsigned int i;
- unsigned char c;
-
- for (i = 0; i < 500000 ; i++) {
- c = inb_p(HD_STATUS);
- if ((c & (BUSY_STAT | READY_STAT | SEEK_STAT)) == STAT_OK)
- return 0;
- }
- dump_status("reset timed out", c);
- return 1;
-}
-
-static void reset_controller(void)
-{
- int i;
-
- outb_p(4, HD_CMD);
- for (i = 0; i < 1000; i++) barrier();
- outb_p(hd_info[0].ctl & 0x0f, HD_CMD);
- for (i = 0; i < 1000; i++) barrier();
- if (drive_busy())
- printk("hd: controller still busy\n");
- else if ((hd_error = inb(HD_ERROR)) != 1)
- printk("hd: controller reset failed: %02x\n", hd_error);
-}
-
-static void reset_hd(void)
-{
- static int i;
-
-repeat:
- if (reset) {
- reset = 0;
- i = -1;
- reset_controller();
- } else {
- check_status();
- if (reset)
- goto repeat;
- }
- if (++i < NR_HD) {
- struct hd_i_struct *disk = &hd_info[i];
- disk->special_op = disk->recalibrate = 1;
- hd_out(disk, disk->sect, disk->sect, disk->head-1,
- disk->cyl, ATA_CMD_INIT_DEV_PARAMS, &reset_hd);
- if (reset)
- goto repeat;
- } else
- hd_request();
-}
-
-/*
- * Ok, don't know what to do with the unexpected interrupts: on some machines
- * doing a reset and a retry seems to result in an eternal loop. Right now I
- * ignore it, and just set the timeout.
- *
- * On laptops (and "green" PCs), an unexpected interrupt occurs whenever the
- * drive enters "idle", "standby", or "sleep" mode, so if the status looks
- * "good", we just ignore the interrupt completely.
- */
-static void unexpected_hd_interrupt(void)
-{
- unsigned int stat = inb_p(HD_STATUS);
-
- if (stat & (BUSY_STAT|DRQ_STAT|ECC_STAT|ERR_STAT)) {
- dump_status("unexpected interrupt", stat);
- SET_TIMER;
- }
-}
-
-/*
- * bad_rw_intr() now tries to be a bit smarter and does things
- * according to the error returned by the controller.
- * -Mika Liljeberg (liljeber@cs.Helsinki.FI)
- */
-static void bad_rw_intr(void)
-{
- struct request *req = hd_req;
-
- if (req != NULL) {
- struct hd_i_struct *disk = req->rq_disk->private_data;
- if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
- hd_end_request_cur(-EIO);
- disk->special_op = disk->recalibrate = 1;
- } else if (req->errors % RESET_FREQ == 0)
- reset = 1;
- else if ((hd_error & TRK0_ERR) || req->errors % RECAL_FREQ == 0)
- disk->special_op = disk->recalibrate = 1;
- /* Otherwise just retry */
- }
-}
-
-static inline int wait_DRQ(void)
-{
- int retries;
- int stat;
-
- for (retries = 0; retries < 100000; retries++) {
- stat = inb_p(HD_STATUS);
- if (stat & DRQ_STAT)
- return 0;
- }
- dump_status("wait_DRQ", stat);
- return -1;
-}
-
-static void read_intr(void)
-{
- struct request *req;
- int i, retries = 100000;
-
- do {
- i = (unsigned) inb_p(HD_STATUS);
- if (i & BUSY_STAT)
- continue;
- if (!OK_STATUS(i))
- break;
- if (i & DRQ_STAT)
- goto ok_to_read;
- } while (--retries > 0);
- dump_status("read_intr", i);
- bad_rw_intr();
- hd_request();
- return;
-
-ok_to_read:
- req = hd_req;
- insw(HD_DATA, bio_data(req->bio), 256);
-#ifdef DEBUG
- printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
- req->rq_disk->disk_name, blk_rq_pos(req) + 1,
- blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
-#endif
- if (hd_end_request(0, 512)) {
- SET_HANDLER(&read_intr);
- return;
- }
-
- (void) inb_p(HD_STATUS);
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-static void write_intr(void)
-{
- struct request *req = hd_req;
- int i;
- int retries = 100000;
-
- do {
- i = (unsigned) inb_p(HD_STATUS);
- if (i & BUSY_STAT)
- continue;
- if (!OK_STATUS(i))
- break;
- if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT))
- goto ok_to_write;
- } while (--retries > 0);
- dump_status("write_intr", i);
- bad_rw_intr();
- hd_request();
- return;
-
-ok_to_write:
- if (hd_end_request(0, 512)) {
- SET_HANDLER(&write_intr);
- outsw(HD_DATA, bio_data(req->bio), 256);
- return;
- }
-
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-static void recal_intr(void)
-{
- check_status();
-#if (HD_DELAY > 0)
- last_req = read_timer();
-#endif
- hd_request();
-}
-
-/*
- * This is another of the error-routines I don't know what to do with. The
- * best idea seems to just set reset, and start all over again.
- */
-static void hd_times_out(unsigned long dummy)
-{
- char *name;
-
- do_hd = NULL;
-
- if (!hd_req)
- return;
-
- spin_lock_irq(hd_queue->queue_lock);
- reset = 1;
- name = hd_req->rq_disk->disk_name;
- printk("%s: timeout\n", name);
- if (++hd_req->errors >= MAX_ERRORS) {
-#ifdef DEBUG
- printk("%s: too many errors\n", name);
-#endif
- hd_end_request_cur(-EIO);
- }
- hd_request();
- spin_unlock_irq(hd_queue->queue_lock);
-}
-
-static int do_special_op(struct hd_i_struct *disk, struct request *req)
-{
- if (disk->recalibrate) {
- disk->recalibrate = 0;
- hd_out(disk, disk->sect, 0, 0, 0, ATA_CMD_RESTORE, &recal_intr);
- return reset;
- }
- if (disk->head > 16) {
- printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
- hd_end_request_cur(-EIO);
- }
- disk->special_op = 0;
- return 1;
-}
-
-/*
- * The driver enables interrupts as much as possible. In order to do this,
- * (a) the device-interrupt is disabled before entering hd_request(),
- * and (b) the timeout-interrupt is disabled before the sti().
- *
- * Interrupts are still masked (by default) whenever we are exchanging
- * data/cmds with a drive, because some drives seem to have very poor
- * tolerance for latency during I/O. The IDE driver has support to unmask
- * interrupts for non-broken hardware, so use that driver if required.
- */
-static void hd_request(void)
-{
- unsigned int block, nsect, sec, track, head, cyl;
- struct hd_i_struct *disk;
- struct request *req;
-
- if (do_hd)
- return;
-repeat:
- del_timer(&device_timer);
-
- if (!hd_req) {
- hd_req = blk_fetch_request(hd_queue);
- if (!hd_req) {
- do_hd = NULL;
- return;
- }
- }
- req = hd_req;
-
- if (reset) {
- reset_hd();
- return;
- }
- disk = req->rq_disk->private_data;
- block = blk_rq_pos(req);
- nsect = blk_rq_sectors(req);
- if (block >= get_capacity(req->rq_disk) ||
- ((block+nsect) > get_capacity(req->rq_disk))) {
- printk("%s: bad access: block=%d, count=%d\n",
- req->rq_disk->disk_name, block, nsect);
- hd_end_request_cur(-EIO);
- goto repeat;
- }
-
- if (disk->special_op) {
- if (do_special_op(disk, req))
- goto repeat;
- return;
- }
- sec = block % disk->sect + 1;
- track = block / disk->sect;
- head = track % disk->head;
- cyl = track / disk->head;
-#ifdef DEBUG
- printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
- req->rq_disk->disk_name,
- req_data_dir(req) == READ ? "read" : "writ",
- cyl, head, sec, nsect, bio_data(req->bio));
-#endif
-
- switch (req_op(req)) {
- case REQ_OP_READ:
- hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
- &read_intr);
- if (reset)
- goto repeat;
- break;
- case REQ_OP_WRITE:
- hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
- &write_intr);
- if (reset)
- goto repeat;
- if (wait_DRQ()) {
- bad_rw_intr();
- goto repeat;
- }
- outsw(HD_DATA, bio_data(req->bio), 256);
- break;
- default:
- printk("unknown hd-command\n");
- hd_end_request_cur(-EIO);
- break;
- }
-}
-
-static void do_hd_request(struct request_queue *q)
-{
- hd_request();
-}
-
-static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct hd_i_struct *disk = bdev->bd_disk->private_data;
-
- geo->heads = disk->head;
- geo->sectors = disk->sect;
- geo->cylinders = disk->cyl;
- return 0;
-}
-
-/*
- * Releasing a block device means we sync() it, so that it can safely
- * be forgotten about...
- */
-
-static irqreturn_t hd_interrupt(int irq, void *dev_id)
-{
- void (*handler)(void) = do_hd;
-
- spin_lock(hd_queue->queue_lock);
-
- do_hd = NULL;
- del_timer(&device_timer);
- if (!handler)
- handler = unexpected_hd_interrupt;
- handler();
-
- spin_unlock(hd_queue->queue_lock);
-
- return IRQ_HANDLED;
-}
-
-static const struct block_device_operations hd_fops = {
- .getgeo = hd_getgeo,
-};
-
-static int __init hd_init(void)
-{
- int drive;
-
- if (register_blkdev(HD_MAJOR, "hd"))
- return -1;
-
- hd_queue = blk_init_queue(do_hd_request, &hd_lock);
- if (!hd_queue) {
- unregister_blkdev(HD_MAJOR, "hd");
- return -ENOMEM;
- }
-
- blk_queue_max_hw_sectors(hd_queue, 255);
- init_timer(&device_timer);
- device_timer.function = hd_times_out;
- blk_queue_logical_block_size(hd_queue, 512);
-
- if (!NR_HD) {
- /*
- * We don't know anything about the drive. This means
- * that you *MUST* specify the drive parameters to the
- * kernel yourself.
- *
- * If we were on an i386, we used to read this info from
- * the BIOS or CMOS. This doesn't work all that well,
- * since this assumes that this is a primary or secondary
- * drive, and if we're using this legacy driver, it's
- * probably an auxiliary controller added to recover
- * legacy data off an ST-506 drive. Either way, it's
- * definitely safest to have the user explicitly specify
- * the information.
- */
- printk("hd: no drives specified - use hd=cyl,head,sectors"
- " on kernel command line\n");
- goto out;
- }
-
- for (drive = 0 ; drive < NR_HD ; drive++) {
- struct gendisk *disk = alloc_disk(64);
- struct hd_i_struct *p = &hd_info[drive];
- if (!disk)
- goto Enomem;
- disk->major = HD_MAJOR;
- disk->first_minor = drive << 6;
- disk->fops = &hd_fops;
- sprintf(disk->disk_name, "hd%c", 'a'+drive);
- disk->private_data = p;
- set_capacity(disk, p->head * p->sect * p->cyl);
- disk->queue = hd_queue;
- p->unit = drive;
- hd_gendisk[drive] = disk;
- printk("%s: %luMB, CHS=%d/%d/%d\n",
- disk->disk_name, (unsigned long)get_capacity(disk)/2048,
- p->cyl, p->head, p->sect);
- }
-
- if (request_irq(HD_IRQ, hd_interrupt, 0, "hd", NULL)) {
- printk("hd: unable to get IRQ%d for the hard disk driver\n",
- HD_IRQ);
- goto out1;
- }
- if (!request_region(HD_DATA, 8, "hd")) {
- printk(KERN_WARNING "hd: port 0x%x busy\n", HD_DATA);
- goto out2;
- }
- if (!request_region(HD_CMD, 1, "hd(cmd)")) {
- printk(KERN_WARNING "hd: port 0x%x busy\n", HD_CMD);
- goto out3;
- }
-
- /* Let them fly */
- for (drive = 0; drive < NR_HD; drive++)
- add_disk(hd_gendisk[drive]);
-
- return 0;
-
-out3:
- release_region(HD_DATA, 8);
-out2:
- free_irq(HD_IRQ, NULL);
-out1:
- for (drive = 0; drive < NR_HD; drive++)
- put_disk(hd_gendisk[drive]);
- NR_HD = 0;
-out:
- del_timer(&device_timer);
- unregister_blkdev(HD_MAJOR, "hd");
- blk_cleanup_queue(hd_queue);
- return -1;
-Enomem:
- while (drive--)
- put_disk(hd_gendisk[drive]);
- goto out;
-}
-
-static int __init parse_hd_setup(char *line)
-{
- int ints[6];
-
- (void) get_options(line, ARRAY_SIZE(ints), ints);
- hd_setup(NULL, ints);
-
- return 1;
-}
-__setup("hd=", parse_hd_setup);
-
-late_initcall(hd_init);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0ecb6461ed81e..994403efee19d 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -445,32 +445,27 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
return ret;
}
-static inline void handle_partial_read(struct loop_cmd *cmd, long bytes)
+static void lo_complete_rq(struct request *rq)
{
- if (bytes < 0 || op_is_write(req_op(cmd->rq)))
- return;
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
- if (unlikely(bytes < blk_rq_bytes(cmd->rq))) {
+ if (unlikely(req_op(cmd->rq) == REQ_OP_READ && cmd->use_aio &&
+ cmd->ret >= 0 && cmd->ret < blk_rq_bytes(cmd->rq))) {
struct bio *bio = cmd->rq->bio;
- bio_advance(bio, bytes);
+ bio_advance(bio, cmd->ret);
zero_fill_bio(bio);
}
+
+ blk_mq_end_request(rq, cmd->ret < 0 ? -EIO : 0);
}
static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
- struct request *rq = cmd->rq;
-
- handle_partial_read(cmd, ret);
- if (ret > 0)
- ret = 0;
- else if (ret < 0)
- ret = -EIO;
-
- blk_mq_complete_request(rq, ret);
+ cmd->ret = ret;
+ blk_mq_complete_request(cmd->rq);
}
static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -528,6 +523,7 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
case REQ_OP_FLUSH:
return lo_req_flush(lo, rq);
case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
return lo_discard(lo, rq, pos);
case REQ_OP_WRITE:
if (lo->transfer)
@@ -826,7 +822,7 @@ static void loop_config_discard(struct loop_device *lo)
q->limits.discard_granularity = 0;
q->limits.discard_alignment = 0;
blk_queue_max_discard_sectors(q, 0);
- q->limits.discard_zeroes_data = 0;
+ blk_queue_max_write_zeroes_sectors(q, 0);
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
return;
}
@@ -834,7 +830,7 @@ static void loop_config_discard(struct loop_device *lo)
q->limits.discard_granularity = inode->i_sb->s_blocksize;
q->limits.discard_alignment = 0;
blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
- q->limits.discard_zeroes_data = 1;
+ blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
}
@@ -1660,6 +1656,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
switch (req_op(cmd->rq)) {
case REQ_OP_FLUSH:
case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
cmd->use_aio = false;
break;
default:
@@ -1686,8 +1683,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
ret = do_req_filebacked(lo, cmd->rq);
failed:
/* complete non-aio request */
- if (!cmd->use_aio || ret)
- blk_mq_complete_request(cmd->rq, ret ? -EIO : 0);
+ if (!cmd->use_aio || ret) {
+ cmd->ret = ret ? -EIO : 0;
+ blk_mq_complete_request(cmd->rq);
+ }
}
static void loop_queue_work(struct kthread_work *work)
@@ -1710,9 +1709,10 @@ static int loop_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops loop_mq_ops = {
+static const struct blk_mq_ops loop_mq_ops = {
.queue_rq = loop_queue_rq,
.init_request = loop_init_request,
+ .complete = lo_complete_rq,
};
static int loop_add(struct loop_device **l, int i)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index fb2237c73e618..fecd3f97ef8c7 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -70,6 +70,7 @@ struct loop_cmd {
struct request *rq;
struct list_head list;
bool use_aio; /* use AIO interface to handle I/O */
+ long ret;
struct kiocb iocb;
};
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
deleted file mode 100644
index 286f276f586e4..0000000000000
--- a/drivers/block/mg_disk.c
+++ /dev/null
@@ -1,1112 +0,0 @@
-/*
- * drivers/block/mg_disk.c
- *
- * Support for the mGine m[g]flash IO mode.
- * Based on legacy hd.c
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/hdreg.h>
-#include <linux/ata.h>
-#include <linux/interrupt.h>
-#include <linux/delay.h>
-#include <linux/platform_device.h>
-#include <linux/gpio.h>
-#include <linux/mg_disk.h>
-#include <linux/slab.h>
-
-#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
-
-/* name for block device */
-#define MG_DISK_NAME "mgd"
-
-#define MG_DISK_MAJ 0
-#define MG_DISK_MAX_PART 16
-#define MG_SECTOR_SIZE 512
-#define MG_MAX_SECTS 256
-
-/* Register offsets */
-#define MG_BUFF_OFFSET 0x8000
-#define MG_REG_OFFSET 0xC000
-#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */
-#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */
-#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4)
-#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6)
-#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8)
-#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA)
-#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC)
-#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */
-#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */
-#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10)
-#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12)
-
-/* handy status */
-#define MG_STAT_READY (ATA_DRDY | ATA_DSC)
-#define MG_READY_OK(s) (((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \
- ATA_ERR))) == MG_STAT_READY)
-
-/* error code for others */
-#define MG_ERR_NONE 0
-#define MG_ERR_TIMEOUT 0x100
-#define MG_ERR_INIT_STAT 0x101
-#define MG_ERR_TRANSLATION 0x102
-#define MG_ERR_CTRL_RST 0x103
-#define MG_ERR_INV_STAT 0x104
-#define MG_ERR_RSTOUT 0x105
-
-#define MG_MAX_ERRORS 6 /* Max read/write errors */
-
-/* command */
-#define MG_CMD_RD 0x20
-#define MG_CMD_WR 0x30
-#define MG_CMD_SLEEP 0x99
-#define MG_CMD_WAKEUP 0xC3
-#define MG_CMD_ID 0xEC
-#define MG_CMD_WR_CONF 0x3C
-#define MG_CMD_RD_CONF 0x40
-
-/* operation mode */
-#define MG_OP_CASCADE (1 << 0)
-#define MG_OP_CASCADE_SYNC_RD (1 << 1)
-#define MG_OP_CASCADE_SYNC_WR (1 << 2)
-#define MG_OP_INTERLEAVE (1 << 3)
-
-/* synchronous */
-#define MG_BURST_LAT_4 (3 << 4)
-#define MG_BURST_LAT_5 (4 << 4)
-#define MG_BURST_LAT_6 (5 << 4)
-#define MG_BURST_LAT_7 (6 << 4)
-#define MG_BURST_LAT_8 (7 << 4)
-#define MG_BURST_LEN_4 (1 << 1)
-#define MG_BURST_LEN_8 (2 << 1)
-#define MG_BURST_LEN_16 (3 << 1)
-#define MG_BURST_LEN_32 (4 << 1)
-#define MG_BURST_LEN_CONT (0 << 1)
-
-/* timeout value (unit: ms) */
-#define MG_TMAX_CONF_TO_CMD 1
-#define MG_TMAX_WAIT_RD_DRQ 10
-#define MG_TMAX_WAIT_WR_DRQ 500
-#define MG_TMAX_RST_TO_BUSY 10
-#define MG_TMAX_HDRST_TO_RDY 500
-#define MG_TMAX_SWRST_TO_RDY 500
-#define MG_TMAX_RSTOUT 3000
-
-#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
-
-/* main structure for mflash driver */
-struct mg_host {
- struct device *dev;
-
- struct request_queue *breq;
- struct request *req;
- spinlock_t lock;
- struct gendisk *gd;
-
- struct timer_list timer;
- void (*mg_do_intr) (struct mg_host *);
-
- u16 id[ATA_ID_WORDS];
-
- u16 cyls;
- u16 heads;
- u16 sectors;
- u32 n_sectors;
- u32 nres_sectors;
-
- void __iomem *dev_base;
- unsigned int irq;
- unsigned int rst;
- unsigned int rstout;
-
- u32 major;
- u32 error;
-};
-
-/*
- * Debugging macro and defines
- */
-#undef DO_MG_DEBUG
-#ifdef DO_MG_DEBUG
-# define MG_DBG(fmt, args...) \
- printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
-#else /* CONFIG_MG_DEBUG */
-# define MG_DBG(fmt, args...) do { } while (0)
-#endif /* CONFIG_MG_DEBUG */
-
-static void mg_request(struct request_queue *);
-
-static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes)
-{
- if (__blk_end_request(host->req, err, nr_bytes))
- return true;
-
- host->req = NULL;
- return false;
-}
-
-static bool mg_end_request_cur(struct mg_host *host, int err)
-{
- return mg_end_request(host, err, blk_rq_cur_bytes(host->req));
-}
-
-static void mg_dump_status(const char *msg, unsigned int stat,
- struct mg_host *host)
-{
- char *name = MG_DISK_NAME;
-
- if (host->req)
- name = host->req->rq_disk->disk_name;
-
- printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
- if (stat & ATA_BUSY)
- printk("Busy ");
- if (stat & ATA_DRDY)
- printk("DriveReady ");
- if (stat & ATA_DF)
- printk("WriteFault ");
- if (stat & ATA_DSC)
- printk("SeekComplete ");
- if (stat & ATA_DRQ)
- printk("DataRequest ");
- if (stat & ATA_CORR)
- printk("CorrectedError ");
- if (stat & ATA_ERR)
- printk("Error ");
- printk("}\n");
- if ((stat & ATA_ERR) == 0) {
- host->error = 0;
- } else {
- host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR);
- printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg,
- host->error & 0xff);
- if (host->error & ATA_BBK)
- printk("BadSector ");
- if (host->error & ATA_UNC)
- printk("UncorrectableError ");
- if (host->error & ATA_IDNF)
- printk("SectorIdNotFound ");
- if (host->error & ATA_ABORTED)
- printk("DriveStatusError ");
- if (host->error & ATA_AMNF)
- printk("AddrMarkNotFound ");
- printk("}");
- if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
- if (host->req)
- printk(", sector=%u",
- (unsigned int)blk_rq_pos(host->req));
- }
- printk("\n");
- }
-}
-
-static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
-{
- u8 status;
- unsigned long expire, cur_jiffies;
- struct mg_drv_data *prv_data = host->dev->platform_data;
-
- host->error = MG_ERR_NONE;
- expire = jiffies + msecs_to_jiffies(msec);
-
- /* These 2 times dummy status read prevents reading invalid
- * status. A very little time (3 times of mflash operating clk)
- * is required for busy bit is set. Use dummy read instead of
- * busy wait, because mflash's PLL is machine dependent.
- */
- if (prv_data->use_polling) {
- status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- }
-
- status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-
- do {
- cur_jiffies = jiffies;
- if (status & ATA_BUSY) {
- if (expect == ATA_BUSY)
- break;
- } else {
- /* Check the error condition! */
- if (status & ATA_ERR) {
- mg_dump_status("mg_wait", status, host);
- break;
- }
-
- if (expect == MG_STAT_READY)
- if (MG_READY_OK(status))
- break;
-
- if (expect == ATA_DRQ)
- if (status & ATA_DRQ)
- break;
- }
- if (!msec) {
- mg_dump_status("not ready", status, host);
- return MG_ERR_INV_STAT;
- }
-
- status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- } while (time_before(cur_jiffies, expire));
-
- if (time_after_eq(cur_jiffies, expire) && msec)
- host->error = MG_ERR_TIMEOUT;
-
- return host->error;
-}
-
-static unsigned int mg_wait_rstout(u32 rstout, u32 msec)
-{
- unsigned long expire;
-
- expire = jiffies + msecs_to_jiffies(msec);
- while (time_before(jiffies, expire)) {
- if (gpio_get_value(rstout) == 1)
- return MG_ERR_NONE;
- msleep(10);
- }
-
- return MG_ERR_RSTOUT;
-}
-
-static void mg_unexpected_intr(struct mg_host *host)
-{
- u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-
- mg_dump_status("mg_unexpected_intr", status, host);
-}
-
-static irqreturn_t mg_irq(int irq, void *dev_id)
-{
- struct mg_host *host = dev_id;
- void (*handler)(struct mg_host *) = host->mg_do_intr;
-
- spin_lock(&host->lock);
-
- host->mg_do_intr = NULL;
- del_timer(&host->timer);
- if (!handler)
- handler = mg_unexpected_intr;
- handler(host);
-
- spin_unlock(&host->lock);
-
- return IRQ_HANDLED;
-}
-
-/* local copy of ata_id_string() */
-static void mg_id_string(const u16 *id, unsigned char *s,
- unsigned int ofs, unsigned int len)
-{
- unsigned int c;
-
- BUG_ON(len & 1);
-
- while (len > 0) {
- c = id[ofs] >> 8;
- *s = c;
- s++;
-
- c = id[ofs] & 0xff;
- *s = c;
- s++;
-
- ofs++;
- len -= 2;
- }
-}
-
-/* local copy of ata_id_c_string() */
-static void mg_id_c_string(const u16 *id, unsigned char *s,
- unsigned int ofs, unsigned int len)
-{
- unsigned char *p;
-
- mg_id_string(id, s, ofs, len - 1);
-
- p = s + strnlen(s, len - 1);
- while (p > s && p[-1] == ' ')
- p--;
- *p = '\0';
-}
-
-static int mg_get_disk_id(struct mg_host *host)
-{
- u32 i;
- s32 err;
- const u16 *id = host->id;
- struct mg_drv_data *prv_data = host->dev->platform_data;
- char fwrev[ATA_ID_FW_REV_LEN + 1];
- char model[ATA_ID_PROD_LEN + 1];
- char serial[ATA_ID_SERNO_LEN + 1];
-
- if (!prv_data->use_polling)
- outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
- outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND);
- err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ);
- if (err)
- return err;
-
- for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++)
- host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base +
- MG_BUFF_OFFSET + i * 2));
-
- outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
- err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD);
- if (err)
- return err;
-
- if ((id[ATA_ID_FIELD_VALID] & 1) == 0)
- return MG_ERR_TRANSLATION;
-
- host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
- host->cyls = id[ATA_ID_CYLS];
- host->heads = id[ATA_ID_HEADS];
- host->sectors = id[ATA_ID_SECTORS];
-
- if (MG_RES_SEC && host->heads && host->sectors) {
- /* modify cyls, n_sectors */
- host->cyls = (host->n_sectors - MG_RES_SEC) /
- host->heads / host->sectors;
- host->nres_sectors = host->n_sectors - host->cyls *
- host->heads * host->sectors;
- host->n_sectors -= host->nres_sectors;
- }
-
- mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
- mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
- mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
- printk(KERN_INFO "mg_disk: model: %s\n", model);
- printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev);
- printk(KERN_INFO "mg_disk: serial: %s\n", serial);
- printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n",
- host->n_sectors, host->nres_sectors);
-
- if (!prv_data->use_polling)
- outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
- return err;
-}
-
-
-static int mg_disk_init(struct mg_host *host)
-{
- struct mg_drv_data *prv_data = host->dev->platform_data;
- s32 err;
- u8 init_status;
-
- /* hdd rst low */
- gpio_set_value(host->rst, 0);
- err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
- if (err)
- return err;
-
- /* hdd rst high */
- gpio_set_value(host->rst, 1);
- err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY);
- if (err)
- return err;
-
- /* soft reset on */
- outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0),
- (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
- if (err)
- return err;
-
- /* soft reset off */
- outb(prv_data->use_polling ? ATA_NIEN : 0,
- (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY);
- if (err)
- return err;
-
- init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf;
-
- if (init_status == 0xf)
- return MG_ERR_INIT_STAT;
-
- return err;
-}
-
-static void mg_bad_rw_intr(struct mg_host *host)
-{
- if (host->req)
- if (++host->req->errors >= MG_MAX_ERRORS ||
- host->error == MG_ERR_TIMEOUT)
- mg_end_request_cur(host, -EIO);
-}
-
-static unsigned int mg_out(struct mg_host *host,
- unsigned int sect_num,
- unsigned int sect_cnt,
- unsigned int cmd,
- void (*intr_addr)(struct mg_host *))
-{
- struct mg_drv_data *prv_data = host->dev->platform_data;
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
- return host->error;
-
- if (!prv_data->use_polling) {
- host->mg_do_intr = intr_addr;
- mod_timer(&host->timer, jiffies + 3 * HZ);
- }
- if (MG_RES_SEC)
- sect_num += MG_RES_SEC;
- outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT);
- outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM);
- outb((u8)(sect_num >> 8), (unsigned long)host->dev_base +
- MG_REG_CYL_LOW);
- outb((u8)(sect_num >> 16), (unsigned long)host->dev_base +
- MG_REG_CYL_HIGH);
- outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS),
- (unsigned long)host->dev_base + MG_REG_DRV_HEAD);
- outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND);
- return MG_ERR_NONE;
-}
-
-static void mg_read_one(struct mg_host *host, struct request *req)
-{
- u16 *buff = (u16 *)bio_data(req->bio);
- u32 i;
-
- for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
- *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
- (i << 1));
-}
-
-static void mg_read(struct request *req)
-{
- struct mg_host *host = req->rq_disk->private_data;
-
- if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
- MG_CMD_RD, NULL) != MG_ERR_NONE)
- mg_bad_rw_intr(host);
-
- MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
- blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
-
- do {
- if (mg_wait(host, ATA_DRQ,
- MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- }
-
- mg_read_one(host, req);
-
- outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
- MG_REG_COMMAND);
- } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
-}
-
-static void mg_write_one(struct mg_host *host, struct request *req)
-{
- u16 *buff = (u16 *)bio_data(req->bio);
- u32 i;
-
- for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
- outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET +
- (i << 1));
-}
-
-static void mg_write(struct request *req)
-{
- struct mg_host *host = req->rq_disk->private_data;
- unsigned int rem = blk_rq_sectors(req);
-
- if (mg_out(host, blk_rq_pos(req), rem,
- MG_CMD_WR, NULL) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- }
-
- MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
- rem, blk_rq_pos(req), bio_data(req->bio));
-
- if (mg_wait(host, ATA_DRQ,
- MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- }
-
- do {
- mg_write_one(host, req);
-
- outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
- MG_REG_COMMAND);
-
- rem--;
- if (rem > 1 && mg_wait(host, ATA_DRQ,
- MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- } else if (mg_wait(host, MG_STAT_READY,
- MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return;
- }
- } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
-}
-
-static void mg_read_intr(struct mg_host *host)
-{
- struct request *req = host->req;
- u32 i;
-
- /* check status */
- do {
- i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- if (i & ATA_BUSY)
- break;
- if (!MG_READY_OK(i))
- break;
- if (i & ATA_DRQ)
- goto ok_to_read;
- } while (0);
- mg_dump_status("mg_read_intr", i, host);
- mg_bad_rw_intr(host);
- mg_request(host->breq);
- return;
-
-ok_to_read:
- mg_read_one(host, req);
-
- MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
- blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
-
- /* send read confirm */
- outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-
- if (mg_end_request(host, 0, MG_SECTOR_SIZE)) {
- /* set handler if read remains */
- host->mg_do_intr = mg_read_intr;
- mod_timer(&host->timer, jiffies + 3 * HZ);
- } else /* goto next request */
- mg_request(host->breq);
-}
-
-static void mg_write_intr(struct mg_host *host)
-{
- struct request *req = host->req;
- u32 i;
- bool rem;
-
- /* check status */
- do {
- i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
- if (i & ATA_BUSY)
- break;
- if (!MG_READY_OK(i))
- break;
- if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ))
- goto ok_to_write;
- } while (0);
- mg_dump_status("mg_write_intr", i, host);
- mg_bad_rw_intr(host);
- mg_request(host->breq);
- return;
-
-ok_to_write:
- if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
- /* write 1 sector and set handler if remains */
- mg_write_one(host, req);
- MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
- blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
- host->mg_do_intr = mg_write_intr;
- mod_timer(&host->timer, jiffies + 3 * HZ);
- }
-
- /* send write confirm */
- outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
-
- if (!rem)
- mg_request(host->breq);
-}
-
-static void mg_times_out(unsigned long data)
-{
- struct mg_host *host = (struct mg_host *)data;
- char *name;
-
- spin_lock_irq(&host->lock);
-
- if (!host->req)
- goto out_unlock;
-
- host->mg_do_intr = NULL;
-
- name = host->req->rq_disk->disk_name;
- printk(KERN_DEBUG "%s: timeout\n", name);
-
- host->error = MG_ERR_TIMEOUT;
- mg_bad_rw_intr(host);
-
-out_unlock:
- mg_request(host->breq);
- spin_unlock_irq(&host->lock);
-}
-
-static void mg_request_poll(struct request_queue *q)
-{
- struct mg_host *host = q->queuedata;
-
- while (1) {
- if (!host->req) {
- host->req = blk_fetch_request(q);
- if (!host->req)
- break;
- }
-
- switch (req_op(host->req)) {
- case REQ_OP_READ:
- mg_read(host->req);
- break;
- case REQ_OP_WRITE:
- mg_write(host->req);
- break;
- default:
- mg_end_request_cur(host, -EIO);
- break;
- }
- }
-}
-
-static unsigned int mg_issue_req(struct request *req,
- struct mg_host *host,
- unsigned int sect_num,
- unsigned int sect_cnt)
-{
- switch (req_op(host->req)) {
- case REQ_OP_READ:
- if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
- != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return host->error;
- }
- break;
- case REQ_OP_WRITE:
- /* TODO : handler */
- outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
- != MG_ERR_NONE) {
- mg_bad_rw_intr(host);
- return host->error;
- }
- del_timer(&host->timer);
- mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ);
- outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- if (host->error) {
- mg_bad_rw_intr(host);
- return host->error;
- }
- mg_write_one(host, req);
- mod_timer(&host->timer, jiffies + 3 * HZ);
- outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
- MG_REG_COMMAND);
- break;
- default:
- mg_end_request_cur(host, -EIO);
- break;
- }
- return MG_ERR_NONE;
-}
-
-/* This function also called from IRQ context */
-static void mg_request(struct request_queue *q)
-{
- struct mg_host *host = q->queuedata;
- struct request *req;
- u32 sect_num, sect_cnt;
-
- while (1) {
- if (!host->req) {
- host->req = blk_fetch_request(q);
- if (!host->req)
- break;
- }
- req = host->req;
-
- /* check unwanted request call */
- if (host->mg_do_intr)
- return;
-
- del_timer(&host->timer);
-
- sect_num = blk_rq_pos(req);
- /* deal whole segments */
- sect_cnt = blk_rq_sectors(req);
-
- /* sanity check */
- if (sect_num >= get_capacity(req->rq_disk) ||
- ((sect_num + sect_cnt) >
- get_capacity(req->rq_disk))) {
- printk(KERN_WARNING
- "%s: bad access: sector=%d, count=%d\n",
- req->rq_disk->disk_name,
- sect_num, sect_cnt);
- mg_end_request_cur(host, -EIO);
- continue;
- }
-
- if (!mg_issue_req(req, host, sect_num, sect_cnt))
- return;
- }
-}
-
-static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
- struct mg_host *host = bdev->bd_disk->private_data;
-
- geo->cylinders = (unsigned short)host->cyls;
- geo->heads = (unsigned char)host->heads;
- geo->sectors = (unsigned char)host->sectors;
- return 0;
-}
-
-static const struct block_device_operations mg_disk_ops = {
- .getgeo = mg_getgeo
-};
-
-#ifdef CONFIG_PM_SLEEP
-static int mg_suspend(struct device *dev)
-{
- struct mg_drv_data *prv_data = dev->platform_data;
- struct mg_host *host = prv_data->host;
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
- return -EIO;
-
- if (!prv_data->use_polling)
- outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
- outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND);
- /* wait until mflash deep sleep */
- msleep(1);
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) {
- if (!prv_data->use_polling)
- outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
- return -EIO;
- }
-
- return 0;
-}
-
-static int mg_resume(struct device *dev)
-{
- struct mg_drv_data *prv_data = dev->platform_data;
- struct mg_host *host = prv_data->host;
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
- return -EIO;
-
- outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND);
- /* wait until mflash wakeup */
- msleep(1);
-
- if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
- return -EIO;
-
- if (!prv_data->use_polling)
- outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-
- return 0;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
-
-static int mg_probe(struct platform_device *plat_dev)
-{
- struct mg_host *host;
- struct resource *rsc;
- struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
- int err = 0;
-
- if (!prv_data) {
- printk(KERN_ERR "%s:%d fail (no driver_data)\n",
- __func__, __LINE__);
- err = -EINVAL;
- goto probe_err;
- }
-
- /* alloc mg_host */
- host = kzalloc(sizeof(struct mg_host), GFP_KERNEL);
- if (!host) {
- printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n",
- __func__, __LINE__);
- err = -ENOMEM;
- goto probe_err;
- }
- host->major = MG_DISK_MAJ;
-
- /* link each other */
- prv_data->host = host;
- host->dev = &plat_dev->dev;
-
- /* io remap */
- rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0);
- if (!rsc) {
- printk(KERN_ERR "%s:%d platform_get_resource fail\n",
- __func__, __LINE__);
- err = -EINVAL;
- goto probe_err_2;
- }
- host->dev_base = ioremap(rsc->start, resource_size(rsc));
- if (!host->dev_base) {
- printk(KERN_ERR "%s:%d ioremap fail\n",
- __func__, __LINE__);
- err = -EIO;
- goto probe_err_2;
- }
- MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base);
-
- /* get reset pin */
- rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
- MG_RST_PIN);
- if (!rsc) {
- printk(KERN_ERR "%s:%d get reset pin fail\n",
- __func__, __LINE__);
- err = -EIO;
- goto probe_err_3;
- }
- host->rst = rsc->start;
-
- /* init rst pin */
- err = gpio_request(host->rst, MG_RST_PIN);
- if (err)
- goto probe_err_3;
- gpio_direction_output(host->rst, 1);
-
- /* reset out pin */
- if (!(prv_data->dev_attr & MG_DEV_MASK)) {
- err = -EINVAL;
- goto probe_err_3a;
- }
-
- if (prv_data->dev_attr != MG_BOOT_DEV) {
- rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
- MG_RSTOUT_PIN);
- if (!rsc) {
- printk(KERN_ERR "%s:%d get reset-out pin fail\n",
- __func__, __LINE__);
- err = -EIO;
- goto probe_err_3a;
- }
- host->rstout = rsc->start;
- err = gpio_request(host->rstout, MG_RSTOUT_PIN);
- if (err)
- goto probe_err_3a;
- gpio_direction_input(host->rstout);
- }
-
- /* disk reset */
- if (prv_data->dev_attr == MG_STORAGE_DEV) {
- /* If POR seq. not yet finished, wait */
- err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
- if (err)
- goto probe_err_3b;
- err = mg_disk_init(host);
- if (err) {
- printk(KERN_ERR "%s:%d fail (err code : %d)\n",
- __func__, __LINE__, err);
- err = -EIO;
- goto probe_err_3b;
- }
- }
-
- /* get irq resource */
- if (!prv_data->use_polling) {
- host->irq = platform_get_irq(plat_dev, 0);
- if (host->irq == -ENXIO) {
- err = host->irq;
- goto probe_err_3b;
- }
- err = request_irq(host->irq, mg_irq,
- IRQF_TRIGGER_RISING,
- MG_DEV_NAME, host);
- if (err) {
- printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",
- __func__, __LINE__, err);
- goto probe_err_3b;
- }
-
- }
-
- /* get disk id */
- err = mg_get_disk_id(host);
- if (err) {
- printk(KERN_ERR "%s:%d fail (err code : %d)\n",
- __func__, __LINE__, err);
- err = -EIO;
- goto probe_err_4;
- }
-
- err = register_blkdev(host->major, MG_DISK_NAME);
- if (err < 0) {
- printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n",
- __func__, __LINE__, err);
- goto probe_err_4;
- }
- if (!host->major)
- host->major = err;
-
- spin_lock_init(&host->lock);
-
- if (prv_data->use_polling)
- host->breq = blk_init_queue(mg_request_poll, &host->lock);
- else
- host->breq = blk_init_queue(mg_request, &host->lock);
-
- if (!host->breq) {
- err = -ENOMEM;
- printk(KERN_ERR "%s:%d (blk_init_queue) fail\n",
- __func__, __LINE__);
- goto probe_err_5;
- }
- host->breq->queuedata = host;
-
- /* mflash is random device, thanx for the noop */
- err = elevator_change(host->breq, "noop");
- if (err) {
- printk(KERN_ERR "%s:%d (elevator_init) fail\n",
- __func__, __LINE__);
- goto probe_err_6;
- }
- blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS);
- blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
-
- init_timer(&host->timer);
- host->timer.function = mg_times_out;
- host->timer.data = (unsigned long)host;
-
- host->gd = alloc_disk(MG_DISK_MAX_PART);
- if (!host->gd) {
- printk(KERN_ERR "%s:%d (alloc_disk) fail\n",
- __func__, __LINE__);
- err = -ENOMEM;
- goto probe_err_7;
- }
- host->gd->major = host->major;
- host->gd->first_minor = 0;
- host->gd->fops = &mg_disk_ops;
- host->gd->queue = host->breq;
- host->gd->private_data = host;
- sprintf(host->gd->disk_name, MG_DISK_NAME"a");
-
- set_capacity(host->gd, host->n_sectors);
-
- add_disk(host->gd);
-
- return err;
-
-probe_err_7:
- del_timer_sync(&host->timer);
-probe_err_6:
- blk_cleanup_queue(host->breq);
-probe_err_5:
- unregister_blkdev(host->major, MG_DISK_NAME);
-probe_err_4:
- if (!prv_data->use_polling)
- free_irq(host->irq, host);
-probe_err_3b:
- gpio_free(host->rstout);
-probe_err_3a:
- gpio_free(host->rst);
-probe_err_3:
- iounmap(host->dev_base);
-probe_err_2:
- kfree(host);
-probe_err:
- return err;
-}
-
-static int mg_remove(struct platform_device *plat_dev)
-{
- struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
- struct mg_host *host = prv_data->host;
- int err = 0;
-
- /* delete timer */
- del_timer_sync(&host->timer);
-
- /* remove disk */
- if (host->gd) {
- del_gendisk(host->gd);
- put_disk(host->gd);
- }
- /* remove queue */
- if (host->breq)
- blk_cleanup_queue(host->breq);
-
- /* unregister blk device */
- unregister_blkdev(host->major, MG_DISK_NAME);
-
- /* free irq */
- if (!prv_data->use_polling)
- free_irq(host->irq, host);
-
- /* free reset-out pin */
- if (prv_data->dev_attr != MG_BOOT_DEV)
- gpio_free(host->rstout);
-
- /* free rst pin */
- if (host->rst)
- gpio_free(host->rst);
-
- /* unmap io */
- if (host->dev_base)
- iounmap(host->dev_base);
-
- /* free mg_host */
- kfree(host);
-
- return err;
-}
-
-static struct platform_driver mg_disk_driver = {
- .probe = mg_probe,
- .remove = mg_remove,
- .driver = {
- .name = MG_DEV_NAME,
- .pm = &mg_pm,
- }
-};
-
-/****************************************************************************
- *
- * Module stuff
- *
- ****************************************************************************/
-
-static int __init mg_init(void)
-{
- printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n");
- return platform_driver_register(&mg_disk_driver);
-}
-
-static void __exit mg_exit(void)
-{
- printk(KERN_INFO "mflash driver : bye bye\n");
- platform_driver_unregister(&mg_disk_driver);
-}
-
-module_init(mg_init);
-module_exit(mg_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("unsik Kim <donari75@gmail.com>");
-MODULE_DESCRIPTION("mGine m[g]flash device driver");
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 1d1dc11aa5fae..02804cc79d828 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -169,6 +169,25 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
return false; /* device present */
}
+/* we have to use runtime tag to setup command header */
+static void mtip_init_cmd_header(struct request *rq)
+{
+ struct driver_data *dd = rq->q->queuedata;
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
+
+ /* Point the command headers at the command tables. */
+ cmd->command_header = dd->port->command_list +
+ (sizeof(struct mtip_cmd_hdr) * rq->tag);
+ cmd->command_header_dma = dd->port->command_list_dma +
+ (sizeof(struct mtip_cmd_hdr) * rq->tag);
+
+ if (host_cap_64)
+ cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
+
+ cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+}
+
static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
{
struct request *rq;
@@ -180,6 +199,9 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
if (IS_ERR(rq))
return NULL;
+ /* Internal cmd isn't submitted via .queue_rq */
+ mtip_init_cmd_header(rq);
+
return blk_mq_rq_to_pdu(rq);
}
@@ -241,7 +263,8 @@ static void mtip_async_complete(struct mtip_port *port,
rq = mtip_rq_from_tag(dd, tag);
- blk_mq_complete_request(rq, status);
+ cmd->status = status;
+ blk_mq_complete_request(rq);
}
/*
@@ -2910,18 +2933,19 @@ static void mtip_softirq_done_fn(struct request *rq)
if (unlikely(cmd->unaligned))
up(&dd->port->cmd_slot_unal);
- blk_mq_end_request(rq, rq->errors);
+ blk_mq_end_request(rq, cmd->status);
}
static void mtip_abort_cmd(struct request *req, void *data,
bool reserved)
{
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
struct driver_data *dd = data;
dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
clear_bit(req->tag, dd->port->cmds_to_issue);
- req->errors = -EIO;
+ cmd->status = -EIO;
mtip_softirq_done_fn(req);
}
@@ -3807,6 +3831,8 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *rq = bd->rq;
int ret;
+ mtip_init_cmd_header(rq);
+
if (unlikely(mtip_check_unal_depth(hctx, rq)))
return BLK_MQ_RQ_QUEUE_BUSY;
@@ -3816,7 +3842,6 @@ static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
if (likely(!ret))
return BLK_MQ_RQ_QUEUE_OK;
- rq->errors = ret;
return BLK_MQ_RQ_QUEUE_ERROR;
}
@@ -3838,7 +3863,6 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
{
struct driver_data *dd = data;
struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
- u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
/*
* For flush requests, request_idx starts at the end of the
@@ -3855,17 +3879,6 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
memset(cmd->command, 0, CMD_DMA_ALLOC_SZ);
- /* Point the command headers at the command tables. */
- cmd->command_header = dd->port->command_list +
- (sizeof(struct mtip_cmd_hdr) * request_idx);
- cmd->command_header_dma = dd->port->command_list_dma +
- (sizeof(struct mtip_cmd_hdr) * request_idx);
-
- if (host_cap_64)
- cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
-
- cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
-
sg_init_table(cmd->sg, MTIP_MAX_SG);
return 0;
}
@@ -3889,7 +3902,7 @@ exit_handler:
return BLK_EH_RESET_TIMER;
}
-static struct blk_mq_ops mtip_mq_ops = {
+static const struct blk_mq_ops mtip_mq_ops = {
.queue_rq = mtip_queue_rq,
.init_request = mtip_init_cmd,
.exit_request = mtip_free_cmd,
@@ -4025,7 +4038,6 @@ skip_create_disk:
dd->queue->limits.discard_granularity = 4096;
blk_queue_max_discard_sectors(dd->queue,
MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
- dd->queue->limits.discard_zeroes_data = 0;
}
/* Set the capacity of the device in 512 byte sectors. */
@@ -4107,9 +4119,11 @@ static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
struct driver_data *dd = (struct driver_data *)data;
struct mtip_cmd *cmd;
- if (likely(!reserv))
- blk_mq_complete_request(rq, -ENODEV);
- else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) {
+ if (likely(!reserv)) {
+ cmd = blk_mq_rq_to_pdu(rq);
+ cmd->status = -ENODEV;
+ blk_mq_complete_request(rq);
+ } else if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &dd->port->flags)) {
cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
if (cmd->comp_func)
@@ -4162,7 +4176,7 @@ static int mtip_block_remove(struct driver_data *dd)
dev_info(&dd->pdev->dev, "device %s surprise removal\n",
dd->disk->disk_name);
- blk_mq_freeze_queue_start(dd->queue);
+ blk_freeze_queue_start(dd->queue);
blk_mq_stop_hw_queues(dd->queue);
blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 7617888f79449..57b41528a8248 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -352,6 +352,7 @@ struct mtip_cmd {
int retries; /* The number of retries left for this command. */
int direction; /* Data transfer direction */
+ int status;
};
/* Structure used to describe a port. */
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d8a23561b4cb4..ac376b9b852d6 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -40,49 +40,82 @@
#include <asm/types.h>
#include <linux/nbd.h>
+#include <linux/nbd-netlink.h>
+#include <net/genetlink.h>
static DEFINE_IDR(nbd_index_idr);
static DEFINE_MUTEX(nbd_index_mutex);
+static int nbd_total_devices = 0;
struct nbd_sock {
struct socket *sock;
struct mutex tx_lock;
struct request *pending;
int sent;
+ bool dead;
+ int fallback_index;
+ int cookie;
+};
+
+struct recv_thread_args {
+ struct work_struct work;
+ struct nbd_device *nbd;
+ int index;
+};
+
+struct link_dead_args {
+ struct work_struct work;
+ int index;
};
#define NBD_TIMEDOUT 0
#define NBD_DISCONNECT_REQUESTED 1
#define NBD_DISCONNECTED 2
-#define NBD_RUNNING 3
+#define NBD_HAS_PID_FILE 3
+#define NBD_HAS_CONFIG_REF 4
+#define NBD_BOUND 5
+#define NBD_DESTROY_ON_DISCONNECT 6
-struct nbd_device {
+struct nbd_config {
u32 flags;
unsigned long runtime_flags;
- struct nbd_sock **socks;
- int magic;
+ u64 dead_conn_timeout;
- struct blk_mq_tag_set tag_set;
-
- struct mutex config_lock;
- struct gendisk *disk;
+ struct nbd_sock **socks;
int num_connections;
+ atomic_t live_connections;
+ wait_queue_head_t conn_wait;
+
atomic_t recv_threads;
wait_queue_head_t recv_wq;
loff_t blksize;
loff_t bytesize;
-
- struct task_struct *task_recv;
- struct task_struct *task_setup;
-
#if IS_ENABLED(CONFIG_DEBUG_FS)
struct dentry *dbg_dir;
#endif
};
+struct nbd_device {
+ struct blk_mq_tag_set tag_set;
+
+ int index;
+ refcount_t config_refs;
+ refcount_t refs;
+ struct nbd_config *config;
+ struct mutex config_lock;
+ struct gendisk *disk;
+
+ struct list_head list;
+ struct task_struct *task_recv;
+ struct task_struct *task_setup;
+};
+
struct nbd_cmd {
struct nbd_device *nbd;
+ int index;
+ int cookie;
struct completion send_complete;
+ int status;
};
#if IS_ENABLED(CONFIG_DEBUG_FS)
@@ -100,18 +133,16 @@ static int part_shift;
static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);
-
+static void nbd_config_put(struct nbd_device *nbd);
+static void nbd_connect_reply(struct genl_info *info, int index);
+static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
+static void nbd_dead_link_work(struct work_struct *work);
static inline struct device *nbd_to_dev(struct nbd_device *nbd)
{
return disk_to_dev(nbd->disk);
}
-static bool nbd_is_connected(struct nbd_device *nbd)
-{
- return !!nbd->task_recv;
-}
-
static const char *nbdcmd_to_ascii(int cmd)
{
switch (cmd) {
@@ -124,44 +155,104 @@ static const char *nbdcmd_to_ascii(int cmd)
return "invalid";
}
-static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
+static ssize_t pid_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- if (bdev->bd_openers <= 1)
- bd_set_size(bdev, 0);
- set_capacity(nbd->disk, 0);
- kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
- return 0;
+ return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
+}
+
+static struct device_attribute pid_attr = {
+ .attr = { .name = "pid", .mode = S_IRUGO},
+ .show = pid_show,
+};
+
+static void nbd_dev_remove(struct nbd_device *nbd)
+{
+ struct gendisk *disk = nbd->disk;
+ if (disk) {
+ del_gendisk(disk);
+ blk_cleanup_queue(disk->queue);
+ blk_mq_free_tag_set(&nbd->tag_set);
+ disk->private_data = NULL;
+ put_disk(disk);
+ }
+ kfree(nbd);
+}
+
+static void nbd_put(struct nbd_device *nbd)
+{
+ if (refcount_dec_and_mutex_lock(&nbd->refs,
+ &nbd_index_mutex)) {
+ idr_remove(&nbd_index_idr, nbd->index);
+ mutex_unlock(&nbd_index_mutex);
+ nbd_dev_remove(nbd);
+ }
+}
+
+static int nbd_disconnected(struct nbd_config *config)
+{
+ return test_bit(NBD_DISCONNECTED, &config->runtime_flags) ||
+ test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags);
+}
+
+static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
+ int notify)
+{
+ if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
+ struct link_dead_args *args;
+ args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
+ if (args) {
+ INIT_WORK(&args->work, nbd_dead_link_work);
+ args->index = nbd->index;
+ queue_work(system_wq, &args->work);
+ }
+ }
+ if (!nsock->dead) {
+ kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+ atomic_dec(&nbd->config->live_connections);
+ }
+ nsock->dead = true;
+ nsock->pending = NULL;
+ nsock->sent = 0;
+}
+
+static void nbd_size_clear(struct nbd_device *nbd)
+{
+ if (nbd->config->bytesize) {
+ set_capacity(nbd->disk, 0);
+ kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+ }
}
-static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_size_update(struct nbd_device *nbd)
{
- blk_queue_logical_block_size(nbd->disk->queue, nbd->blksize);
- blk_queue_physical_block_size(nbd->disk->queue, nbd->blksize);
- bd_set_size(bdev, nbd->bytesize);
- set_capacity(nbd->disk, nbd->bytesize >> 9);
+ struct nbd_config *config = nbd->config;
+ blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
+ blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
+ set_capacity(nbd->disk, config->bytesize >> 9);
kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
}
-static void nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
- loff_t blocksize, loff_t nr_blocks)
+static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
+ loff_t nr_blocks)
{
- nbd->blksize = blocksize;
- nbd->bytesize = blocksize * nr_blocks;
- if (nbd_is_connected(nbd))
- nbd_size_update(nbd, bdev);
+ struct nbd_config *config = nbd->config;
+ config->blksize = blocksize;
+ config->bytesize = blocksize * nr_blocks;
+ nbd_size_update(nbd);
}
-static void nbd_end_request(struct nbd_cmd *cmd)
+static void nbd_complete_rq(struct request *req)
{
- struct nbd_device *nbd = cmd->nbd;
- struct request *req = blk_mq_rq_from_pdu(cmd);
- int error = req->errors ? -EIO : 0;
+ struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
- dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd,
- error ? "failed" : "done");
+ dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", cmd,
+ cmd->status ? "failed" : "done");
- blk_mq_complete_request(req, error);
+ blk_mq_end_request(req, cmd->status);
}
/*
@@ -169,17 +260,18 @@ static void nbd_end_request(struct nbd_cmd *cmd)
*/
static void sock_shutdown(struct nbd_device *nbd)
{
+ struct nbd_config *config = nbd->config;
int i;
- if (nbd->num_connections == 0)
+ if (config->num_connections == 0)
return;
- if (test_and_set_bit(NBD_DISCONNECTED, &nbd->runtime_flags))
+ if (test_and_set_bit(NBD_DISCONNECTED, &config->runtime_flags))
return;
- for (i = 0; i < nbd->num_connections; i++) {
- struct nbd_sock *nsock = nbd->socks[i];
+ for (i = 0; i < config->num_connections; i++) {
+ struct nbd_sock *nsock = config->socks[i];
mutex_lock(&nsock->tx_lock);
- kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+ nbd_mark_nsock_dead(nbd, nsock, 0);
mutex_unlock(&nsock->tx_lock);
}
dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
@@ -190,14 +282,58 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
struct nbd_device *nbd = cmd->nbd;
+ struct nbd_config *config;
- dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
- set_bit(NBD_TIMEDOUT, &nbd->runtime_flags);
- req->errors = -EIO;
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ cmd->status = -EIO;
+ return BLK_EH_HANDLED;
+ }
- mutex_lock(&nbd->config_lock);
+ /* If we are waiting on our dead timer then we could get timeout
+ * callbacks for our request. For this we just want to reset the timer
+ * and let the queue side take care of everything.
+ */
+ if (!completion_done(&cmd->send_complete)) {
+ nbd_config_put(nbd);
+ return BLK_EH_RESET_TIMER;
+ }
+ config = nbd->config;
+
+ if (config->num_connections > 1) {
+ dev_err_ratelimited(nbd_to_dev(nbd),
+ "Connection timed out, retrying\n");
+ /*
+ * Hooray we have more connections, requeue this IO, the submit
+ * path will put it on a real connection.
+ */
+ if (config->socks && config->num_connections > 1) {
+ if (cmd->index < config->num_connections) {
+ struct nbd_sock *nsock =
+ config->socks[cmd->index];
+ mutex_lock(&nsock->tx_lock);
+ /* We can have multiple outstanding requests, so
+ * we don't want to mark the nsock dead if we've
+ * already reconnected with a new socket, so
+ * only mark it dead if its the same socket we
+ * were sent out on.
+ */
+ if (cmd->cookie == nsock->cookie)
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ mutex_unlock(&nsock->tx_lock);
+ }
+ blk_mq_requeue_request(req, true);
+ nbd_config_put(nbd);
+ return BLK_EH_NOT_HANDLED;
+ }
+ } else {
+ dev_err_ratelimited(nbd_to_dev(nbd),
+ "Connection timed out\n");
+ }
+ set_bit(NBD_TIMEDOUT, &config->runtime_flags);
+ cmd->status = -EIO;
sock_shutdown(nbd);
- mutex_unlock(&nbd->config_lock);
+ nbd_config_put(nbd);
+
return BLK_EH_HANDLED;
}
@@ -207,7 +343,8 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
static int sock_xmit(struct nbd_device *nbd, int index, int send,
struct iov_iter *iter, int msg_flags, int *sent)
{
- struct socket *sock = nbd->socks[index]->sock;
+ struct nbd_config *config = nbd->config;
+ struct socket *sock = config->socks[index]->sock;
int result;
struct msghdr msg;
unsigned long pflags = current->flags;
@@ -253,7 +390,8 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
- struct nbd_sock *nsock = nbd->socks[index];
+ struct nbd_config *config = nbd->config;
+ struct nbd_sock *nsock = config->socks[index];
int result;
struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
@@ -284,7 +422,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
}
if (rq_data_dir(req) == WRITE &&
- (nbd->flags & NBD_FLAG_READ_ONLY)) {
+ (config->flags & NBD_FLAG_READ_ONLY)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Write on read-only\n");
return -EIO;
@@ -301,6 +439,8 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
}
iov_iter_advance(&from, sent);
}
+ cmd->index = index;
+ cmd->cookie = nsock->cookie;
request.type = htonl(type);
if (type != NBD_CMD_FLUSH) {
request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
@@ -328,7 +468,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
}
dev_err_ratelimited(disk_to_dev(nbd->disk),
"Send control failed (result %d)\n", result);
- return -EIO;
+ return -EAGAIN;
}
send_pages:
if (type != NBD_CMD_WRITE)
@@ -370,7 +510,7 @@ send_pages:
dev_err(disk_to_dev(nbd->disk),
"Send data failed (result %d)\n",
result);
- return -EIO;
+ return -EAGAIN;
}
/*
* The completion might already have come in,
@@ -392,6 +532,7 @@ out:
/* NULL returned = something went wrong, inform userspace */
static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
{
+ struct nbd_config *config = nbd->config;
int result;
struct nbd_reply reply;
struct nbd_cmd *cmd;
@@ -405,8 +546,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
iov_iter_kvec(&to, READ | ITER_KVEC, &iov, 1, sizeof(reply));
result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
if (result <= 0) {
- if (!test_bit(NBD_DISCONNECTED, &nbd->runtime_flags) &&
- !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
+ if (!nbd_disconnected(config))
dev_err(disk_to_dev(nbd->disk),
"Receive control failed (result %d)\n", result);
return ERR_PTR(result);
@@ -433,7 +573,7 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
if (ntohl(reply.error)) {
dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
ntohl(reply.error));
- req->errors = -EIO;
+ cmd->status = -EIO;
return cmd;
}
@@ -449,8 +589,19 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
if (result <= 0) {
dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
result);
- req->errors = -EIO;
- return cmd;
+ /*
+ * If we've disconnected or we only have 1
+ * connection then we need to make sure we
+ * complete this request, otherwise error out
+ * and let the timeout stuff handle resubmitting
+ * this request onto another connection.
+ */
+ if (nbd_disconnected(config) ||
+ config->num_connections <= 1) {
+ cmd->status = -EIO;
+ return cmd;
+ }
+ return ERR_PTR(-EIO);
}
dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
cmd, bvec.bv_len);
@@ -462,54 +613,34 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
return cmd;
}
-static ssize_t pid_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct gendisk *disk = dev_to_disk(dev);
- struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
-
- return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
-}
-
-static struct device_attribute pid_attr = {
- .attr = { .name = "pid", .mode = S_IRUGO},
- .show = pid_show,
-};
-
-struct recv_thread_args {
- struct work_struct work;
- struct nbd_device *nbd;
- int index;
-};
-
static void recv_work(struct work_struct *work)
{
struct recv_thread_args *args = container_of(work,
struct recv_thread_args,
work);
struct nbd_device *nbd = args->nbd;
+ struct nbd_config *config = nbd->config;
struct nbd_cmd *cmd;
int ret = 0;
- BUG_ON(nbd->magic != NBD_MAGIC);
while (1) {
cmd = nbd_read_stat(nbd, args->index);
if (IS_ERR(cmd)) {
+ struct nbd_sock *nsock = config->socks[args->index];
+
+ mutex_lock(&nsock->tx_lock);
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ mutex_unlock(&nsock->tx_lock);
ret = PTR_ERR(cmd);
break;
}
- nbd_end_request(cmd);
+ blk_mq_complete_request(blk_mq_rq_from_pdu(cmd));
}
-
- /*
- * We got an error, shut everybody down if this wasn't the result of a
- * disconnect request.
- */
- if (ret && !test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
- sock_shutdown(nbd);
- atomic_dec(&nbd->recv_threads);
- wake_up(&nbd->recv_wq);
+ atomic_dec(&config->recv_threads);
+ wake_up(&config->recv_wq);
+ nbd_config_put(nbd);
+ kfree(args);
}
static void nbd_clear_req(struct request *req, void *data, bool reserved)
@@ -519,47 +650,119 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
if (!blk_mq_request_started(req))
return;
cmd = blk_mq_rq_to_pdu(req);
- req->errors = -EIO;
- nbd_end_request(cmd);
+ cmd->status = -EIO;
+ blk_mq_complete_request(req);
}
static void nbd_clear_que(struct nbd_device *nbd)
{
- BUG_ON(nbd->magic != NBD_MAGIC);
-
+ blk_mq_stop_hw_queues(nbd->disk->queue);
blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
+ blk_mq_start_hw_queues(nbd->disk->queue);
dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}
+static int find_fallback(struct nbd_device *nbd, int index)
+{
+ struct nbd_config *config = nbd->config;
+ int new_index = -1;
+ struct nbd_sock *nsock = config->socks[index];
+ int fallback = nsock->fallback_index;
+
+ if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+ return new_index;
+
+ if (config->num_connections <= 1) {
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Attempted send on invalid socket\n");
+ return new_index;
+ }
+
+ if (fallback >= 0 && fallback < config->num_connections &&
+ !config->socks[fallback]->dead)
+ return fallback;
+
+ if (nsock->fallback_index < 0 ||
+ nsock->fallback_index >= config->num_connections ||
+ config->socks[nsock->fallback_index]->dead) {
+ int i;
+ for (i = 0; i < config->num_connections; i++) {
+ if (i == index)
+ continue;
+ if (!config->socks[i]->dead) {
+ new_index = i;
+ break;
+ }
+ }
+ nsock->fallback_index = new_index;
+ if (new_index < 0) {
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Dead connection, failed to find a fallback\n");
+ return new_index;
+ }
+ }
+ new_index = nsock->fallback_index;
+ return new_index;
+}
+
+static int wait_for_reconnect(struct nbd_device *nbd)
+{
+ struct nbd_config *config = nbd->config;
+ if (!config->dead_conn_timeout)
+ return 0;
+ if (test_bit(NBD_DISCONNECTED, &config->runtime_flags))
+ return 0;
+ wait_event_interruptible_timeout(config->conn_wait,
+ atomic_read(&config->live_connections),
+ config->dead_conn_timeout);
+ return atomic_read(&config->live_connections);
+}
static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
{
struct request *req = blk_mq_rq_from_pdu(cmd);
struct nbd_device *nbd = cmd->nbd;
+ struct nbd_config *config;
struct nbd_sock *nsock;
int ret;
- if (index >= nbd->num_connections) {
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Attempted send on invalid socket\n");
+ "Socks array is empty\n");
return -EINVAL;
}
+ config = nbd->config;
- if (test_bit(NBD_DISCONNECTED, &nbd->runtime_flags)) {
+ if (index >= config->num_connections) {
dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Attempted send on closed socket\n");
+ "Attempted send on invalid socket\n");
+ nbd_config_put(nbd);
return -EINVAL;
}
-
- req->errors = 0;
-
- nsock = nbd->socks[index];
+ cmd->status = 0;
+again:
+ nsock = config->socks[index];
mutex_lock(&nsock->tx_lock);
- if (unlikely(!nsock->sock)) {
+ if (nsock->dead) {
+ int old_index = index;
+ index = find_fallback(nbd, index);
mutex_unlock(&nsock->tx_lock);
- dev_err_ratelimited(disk_to_dev(nbd->disk),
- "Attempted send on closed socket\n");
- return -EINVAL;
+ if (index < 0) {
+ if (wait_for_reconnect(nbd)) {
+ index = old_index;
+ goto again;
+ }
+ /* All the sockets should already be down at this point,
+ * we just want to make sure that DISCONNECTED is set so
+ * any requests that come in that were queued waiting
+ * for the reconnect timer don't trigger the timer again
+ * and instead just error out.
+ */
+ sock_shutdown(nbd);
+ nbd_config_put(nbd);
+ return -EIO;
+ }
+ goto again;
}
/* Handle the case that we have a pending request that was partially
@@ -572,9 +775,21 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
ret = 0;
goto out;
}
+ /*
+ * Some failures are related to the link going down, so anything that
+ * returns EAGAIN can be retried on a different socket.
+ */
ret = nbd_send_cmd(nbd, cmd, index);
+ if (ret == -EAGAIN) {
+ dev_err_ratelimited(disk_to_dev(nbd->disk),
+ "Request send failed trying another connection\n");
+ nbd_mark_nsock_dead(nbd, nsock, 1);
+ mutex_unlock(&nsock->tx_lock);
+ goto again;
+ }
out:
mutex_unlock(&nsock->tx_lock);
+ nbd_config_put(nbd);
return ret;
}
@@ -611,9 +826,10 @@ static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
return ret;
}
-static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
- unsigned long arg)
+static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
+ bool netlink)
{
+ struct nbd_config *config = nbd->config;
struct socket *sock;
struct nbd_sock **socks;
struct nbd_sock *nsock;
@@ -623,43 +839,107 @@ static int nbd_add_socket(struct nbd_device *nbd, struct block_device *bdev,
if (!sock)
return err;
- if (!nbd->task_setup)
+ if (!netlink && !nbd->task_setup &&
+ !test_bit(NBD_BOUND, &config->runtime_flags))
nbd->task_setup = current;
- if (nbd->task_setup != current) {
+
+ if (!netlink &&
+ (nbd->task_setup != current ||
+ test_bit(NBD_BOUND, &config->runtime_flags))) {
dev_err(disk_to_dev(nbd->disk),
"Device being setup by another task");
- return -EINVAL;
+ sockfd_put(sock);
+ return -EBUSY;
}
- socks = krealloc(nbd->socks, (nbd->num_connections + 1) *
+ socks = krealloc(config->socks, (config->num_connections + 1) *
sizeof(struct nbd_sock *), GFP_KERNEL);
- if (!socks)
+ if (!socks) {
+ sockfd_put(sock);
return -ENOMEM;
+ }
nsock = kzalloc(sizeof(struct nbd_sock), GFP_KERNEL);
- if (!nsock)
+ if (!nsock) {
+ sockfd_put(sock);
return -ENOMEM;
+ }
- nbd->socks = socks;
+ config->socks = socks;
+ nsock->fallback_index = -1;
+ nsock->dead = false;
mutex_init(&nsock->tx_lock);
nsock->sock = sock;
nsock->pending = NULL;
nsock->sent = 0;
- socks[nbd->num_connections++] = nsock;
+ nsock->cookie = 0;
+ socks[config->num_connections++] = nsock;
+ atomic_inc(&config->live_connections);
- if (max_part)
- bdev->bd_invalidated = 1;
return 0;
}
+static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
+{
+ struct nbd_config *config = nbd->config;
+ struct socket *sock, *old;
+ struct recv_thread_args *args;
+ int i;
+ int err;
+
+ sock = sockfd_lookup(arg, &err);
+ if (!sock)
+ return err;
+
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args) {
+ sockfd_put(sock);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < config->num_connections; i++) {
+ struct nbd_sock *nsock = config->socks[i];
+
+ if (!nsock->dead)
+ continue;
+
+ mutex_lock(&nsock->tx_lock);
+ if (!nsock->dead) {
+ mutex_unlock(&nsock->tx_lock);
+ continue;
+ }
+ sk_set_memalloc(sock->sk);
+ atomic_inc(&config->recv_threads);
+ refcount_inc(&nbd->config_refs);
+ old = nsock->sock;
+ nsock->fallback_index = -1;
+ nsock->sock = sock;
+ nsock->dead = false;
+ INIT_WORK(&args->work, recv_work);
+ args->index = i;
+ args->nbd = nbd;
+ nsock->cookie++;
+ mutex_unlock(&nsock->tx_lock);
+ sockfd_put(old);
+
+ /* We take the tx_lock in an error path in the recv_work, so we
+ * need to queue_work outside of the tx_lock.
+ */
+ queue_work(recv_workqueue, &args->work);
+
+ atomic_inc(&config->live_connections);
+ wake_up(&config->conn_wait);
+ return 0;
+ }
+ sockfd_put(sock);
+ kfree(args);
+ return -ENOSPC;
+}
+
/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
- nbd->runtime_flags = 0;
- nbd->blksize = 1024;
- nbd->bytesize = 0;
- set_capacity(nbd->disk, 0);
- nbd->flags = 0;
+ nbd->config = NULL;
nbd->tag_set.timeout = 0;
queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
}
@@ -668,21 +948,23 @@ static void nbd_bdev_reset(struct block_device *bdev)
{
if (bdev->bd_openers > 1)
return;
- set_device_ro(bdev, false);
- bdev->bd_inode->i_size = 0;
+ bd_set_size(bdev, 0);
if (max_part > 0) {
blkdev_reread_part(bdev);
bdev->bd_invalidated = 1;
}
}
-static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_parse_flags(struct nbd_device *nbd)
{
- if (nbd->flags & NBD_FLAG_READ_ONLY)
- set_device_ro(bdev, true);
- if (nbd->flags & NBD_FLAG_SEND_TRIM)
+ struct nbd_config *config = nbd->config;
+ if (config->flags & NBD_FLAG_READ_ONLY)
+ set_disk_ro(nbd->disk, true);
+ else
+ set_disk_ro(nbd->disk, false);
+ if (config->flags & NBD_FLAG_SEND_TRIM)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
- if (nbd->flags & NBD_FLAG_SEND_FLUSH)
+ if (config->flags & NBD_FLAG_SEND_FLUSH)
blk_queue_write_cache(nbd->disk->queue, true, false);
else
blk_queue_write_cache(nbd->disk->queue, false, false);
@@ -690,6 +972,7 @@ static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
static void send_disconnects(struct nbd_device *nbd)
{
+ struct nbd_config *config = nbd->config;
struct nbd_request request = {
.magic = htonl(NBD_REQUEST_MAGIC),
.type = htonl(NBD_CMD_DISC),
@@ -698,7 +981,7 @@ static void send_disconnects(struct nbd_device *nbd)
struct iov_iter from;
int i, ret;
- for (i = 0; i < nbd->num_connections; i++) {
+ for (i = 0; i < config->num_connections; i++) {
iov_iter_kvec(&from, WRITE | ITER_KVEC, &iov, 1, sizeof(request));
ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
if (ret <= 0)
@@ -707,145 +990,162 @@ static void send_disconnects(struct nbd_device *nbd)
}
}
-static int nbd_disconnect(struct nbd_device *nbd, struct block_device *bdev)
+static int nbd_disconnect(struct nbd_device *nbd)
{
- dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
- if (!nbd->socks)
- return -EINVAL;
-
- mutex_unlock(&nbd->config_lock);
- fsync_bdev(bdev);
- mutex_lock(&nbd->config_lock);
-
- /* Check again after getting mutex back. */
- if (!nbd->socks)
- return -EINVAL;
+ struct nbd_config *config = nbd->config;
+ dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
if (!test_and_set_bit(NBD_DISCONNECT_REQUESTED,
- &nbd->runtime_flags))
+ &config->runtime_flags))
send_disconnects(nbd);
return 0;
}
-static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev)
+static void nbd_clear_sock(struct nbd_device *nbd)
{
sock_shutdown(nbd);
nbd_clear_que(nbd);
+ nbd->task_setup = NULL;
+}
- __invalidate_device(bdev, true);
- nbd_bdev_reset(bdev);
- /*
- * We want to give the run thread a chance to wait for everybody
- * to clean up and then do it's own cleanup.
- */
- if (!test_bit(NBD_RUNNING, &nbd->runtime_flags) &&
- nbd->num_connections) {
- int i;
-
- for (i = 0; i < nbd->num_connections; i++) {
- sockfd_put(nbd->socks[i]->sock);
- kfree(nbd->socks[i]);
+static void nbd_config_put(struct nbd_device *nbd)
+{
+ if (refcount_dec_and_mutex_lock(&nbd->config_refs,
+ &nbd->config_lock)) {
+ struct nbd_config *config = nbd->config;
+ nbd_dev_dbg_close(nbd);
+ nbd_size_clear(nbd);
+ if (test_and_clear_bit(NBD_HAS_PID_FILE,
+ &config->runtime_flags))
+ device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+ nbd->task_recv = NULL;
+ nbd_clear_sock(nbd);
+ if (config->num_connections) {
+ int i;
+ for (i = 0; i < config->num_connections; i++) {
+ sockfd_put(config->socks[i]->sock);
+ kfree(config->socks[i]);
+ }
+ kfree(config->socks);
}
- kfree(nbd->socks);
- nbd->socks = NULL;
- nbd->num_connections = 0;
- }
- nbd->task_setup = NULL;
+ nbd_reset(nbd);
- return 0;
+ mutex_unlock(&nbd->config_lock);
+ nbd_put(nbd);
+ module_put(THIS_MODULE);
+ }
}
-static int nbd_start_device(struct nbd_device *nbd, struct block_device *bdev)
+static int nbd_start_device(struct nbd_device *nbd)
{
- struct recv_thread_args *args;
- int num_connections = nbd->num_connections;
+ struct nbd_config *config = nbd->config;
+ int num_connections = config->num_connections;
int error = 0, i;
if (nbd->task_recv)
return -EBUSY;
- if (!nbd->socks)
+ if (!config->socks)
return -EINVAL;
if (num_connections > 1 &&
- !(nbd->flags & NBD_FLAG_CAN_MULTI_CONN)) {
+ !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
- error = -EINVAL;
- goto out_err;
+ return -EINVAL;
}
- set_bit(NBD_RUNNING, &nbd->runtime_flags);
- blk_mq_update_nr_hw_queues(&nbd->tag_set, nbd->num_connections);
- args = kcalloc(num_connections, sizeof(*args), GFP_KERNEL);
- if (!args) {
- error = -ENOMEM;
- goto out_err;
- }
+ blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
nbd->task_recv = current;
- mutex_unlock(&nbd->config_lock);
- nbd_parse_flags(nbd, bdev);
+ nbd_parse_flags(nbd);
error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
if (error) {
dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
- goto out_recv;
+ return error;
}
-
- nbd_size_update(nbd, bdev);
+ set_bit(NBD_HAS_PID_FILE, &config->runtime_flags);
nbd_dev_dbg_init(nbd);
for (i = 0; i < num_connections; i++) {
- sk_set_memalloc(nbd->socks[i]->sock->sk);
- atomic_inc(&nbd->recv_threads);
- INIT_WORK(&args[i].work, recv_work);
- args[i].nbd = nbd;
- args[i].index = i;
- queue_work(recv_workqueue, &args[i].work);
- }
- wait_event_interruptible(nbd->recv_wq,
- atomic_read(&nbd->recv_threads) == 0);
- for (i = 0; i < num_connections; i++)
- flush_work(&args[i].work);
- nbd_dev_dbg_close(nbd);
- nbd_size_clear(nbd, bdev);
- device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
-out_recv:
- mutex_lock(&nbd->config_lock);
- nbd->task_recv = NULL;
-out_err:
- clear_bit(NBD_RUNNING, &nbd->runtime_flags);
- nbd_clear_sock(nbd, bdev);
+ struct recv_thread_args *args;
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args) {
+ sock_shutdown(nbd);
+ return -ENOMEM;
+ }
+ sk_set_memalloc(config->socks[i]->sock->sk);
+ atomic_inc(&config->recv_threads);
+ refcount_inc(&nbd->config_refs);
+ INIT_WORK(&args->work, recv_work);
+ args->nbd = nbd;
+ args->index = i;
+ queue_work(recv_workqueue, &args->work);
+ }
+ return error;
+}
+
+static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
+{
+ struct nbd_config *config = nbd->config;
+ int ret;
+
+ ret = nbd_start_device(nbd);
+ if (ret)
+ return ret;
+
+ bd_set_size(bdev, config->bytesize);
+ if (max_part)
+ bdev->bd_invalidated = 1;
+ mutex_unlock(&nbd->config_lock);
+ ret = wait_event_interruptible(config->recv_wq,
+ atomic_read(&config->recv_threads) == 0);
+ if (ret)
+ sock_shutdown(nbd);
+ mutex_lock(&nbd->config_lock);
+ bd_set_size(bdev, 0);
/* user requested, ignore socket errors */
- if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags))
- error = 0;
- if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags))
- error = -ETIMEDOUT;
+ if (test_bit(NBD_DISCONNECT_REQUESTED, &config->runtime_flags))
+ ret = 0;
+ if (test_bit(NBD_TIMEDOUT, &config->runtime_flags))
+ ret = -ETIMEDOUT;
+ return ret;
+}
- nbd_reset(nbd);
- return error;
+static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
+ struct block_device *bdev)
+{
+ sock_shutdown(nbd);
+ kill_bdev(bdev);
+ nbd_bdev_reset(bdev);
+ if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+ &nbd->config->runtime_flags))
+ nbd_config_put(nbd);
}
/* Must be called with config_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
unsigned int cmd, unsigned long arg)
{
+ struct nbd_config *config = nbd->config;
+
switch (cmd) {
case NBD_DISCONNECT:
- return nbd_disconnect(nbd, bdev);
+ return nbd_disconnect(nbd);
case NBD_CLEAR_SOCK:
- return nbd_clear_sock(nbd, bdev);
+ nbd_clear_sock_ioctl(nbd, bdev);
+ return 0;
case NBD_SET_SOCK:
- return nbd_add_socket(nbd, bdev, arg);
+ return nbd_add_socket(nbd, arg, false);
case NBD_SET_BLKSIZE:
- nbd_size_set(nbd, bdev, arg,
- div_s64(nbd->bytesize, arg));
+ nbd_size_set(nbd, arg,
+ div_s64(config->bytesize, arg));
return 0;
case NBD_SET_SIZE:
- nbd_size_set(nbd, bdev, nbd->blksize,
- div_s64(arg, nbd->blksize));
+ nbd_size_set(nbd, config->blksize,
+ div_s64(arg, config->blksize));
return 0;
case NBD_SET_SIZE_BLOCKS:
- nbd_size_set(nbd, bdev, nbd->blksize, arg);
+ nbd_size_set(nbd, config->blksize, arg);
return 0;
case NBD_SET_TIMEOUT:
if (arg) {
@@ -855,10 +1155,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
return 0;
case NBD_SET_FLAGS:
- nbd->flags = arg;
+ config->flags = arg;
return 0;
case NBD_DO_IT:
- return nbd_start_device(nbd, bdev);
+ return nbd_start_device_ioctl(nbd, bdev);
case NBD_CLEAR_QUE:
/*
* This is for compatibility only. The queue is always cleared
@@ -879,23 +1179,92 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nbd_device *nbd = bdev->bd_disk->private_data;
- int error;
+ struct nbd_config *config = nbd->config;
+ int error = -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- BUG_ON(nbd->magic != NBD_MAGIC);
-
mutex_lock(&nbd->config_lock);
- error = __nbd_ioctl(bdev, nbd, cmd, arg);
- mutex_unlock(&nbd->config_lock);
+ /* Don't allow ioctl operations on a nbd device that was created with
+ * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
+ */
+ if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+ (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
+ error = __nbd_ioctl(bdev, nbd, cmd, arg);
+ else
+ dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
+ mutex_unlock(&nbd->config_lock);
return error;
}
+/*
+ * Allocate a zeroed nbd_config and fill in its defaults: zeroed recv_threads
+ * and live_connections counters, initialised wait queues and a 1024-byte
+ * block size.  A module reference is taken together with the allocation.
+ *
+ * Returns the new config, or NULL if the allocation fails or the module
+ * reference cannot be obtained.
+ */
+static struct nbd_config *nbd_alloc_config(void)
+{
+	struct nbd_config *config;
+
+	config = kzalloc(sizeof(*config), GFP_NOFS);
+	if (!config)
+		return NULL;
+	/*
+	 * try_module_get() reports failure through its return value; do not
+	 * hand out a config that is not backed by a module reference.
+	 */
+	if (!try_module_get(THIS_MODULE)) {
+		kfree(config);
+		return NULL;
+	}
+	atomic_set(&config->recv_threads, 0);
+	init_waitqueue_head(&config->recv_wq);
+	init_waitqueue_head(&config->conn_wait);
+	config->blksize = 1024;
+	atomic_set(&config->live_connections, 0);
+	return config;
+}
+
+/*
+ * Block device open callback.
+ *
+ * Takes a device reference (nbd->refs) and a config reference
+ * (nbd->config_refs).  If no config is currently live, a fresh one is
+ * allocated under config_lock and config_refs is set to 1.
+ *
+ * Returns 0 on success, -ENXIO if the disk has no nbd device or the device
+ * refcount already dropped to zero, -ENOMEM if the config allocation fails.
+ */
+static int nbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct nbd_device *nbd;
+ int ret = 0;
+
+ mutex_lock(&nbd_index_mutex);
+ nbd = bdev->bd_disk->private_data;
+ if (!nbd) {
+ ret = -ENXIO;
+ goto out;
+ }
+ if (!refcount_inc_not_zero(&nbd->refs)) {
+ ret = -ENXIO;
+ goto out;
+ }
+ /* Fast path: a config is already live, just take another reference. */
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ struct nbd_config *config;
+
+ mutex_lock(&nbd->config_lock);
+ /* Re-check under the lock: a config may have appeared meanwhile. */
+ if (refcount_inc_not_zero(&nbd->config_refs)) {
+ mutex_unlock(&nbd->config_lock);
+ goto out;
+ }
+ config = nbd->config = nbd_alloc_config();
+ if (!config) {
+ /*
+ * NOTE(review): the nbd->refs reference taken above is not
+ * dropped on this path - confirm that is intended.
+ */
+ ret = -ENOMEM;
+ mutex_unlock(&nbd->config_lock);
+ goto out;
+ }
+ refcount_set(&nbd->config_refs, 1);
+ /* Additional device reference taken along with the new config. */
+ refcount_inc(&nbd->refs);
+ mutex_unlock(&nbd->config_lock);
+ }
+out:
+ mutex_unlock(&nbd_index_mutex);
+ return ret;
+}
+
+/*
+ * Block device release callback: drops one config reference and one device
+ * reference, the two references nbd_open() takes on its success paths.
+ */
+static void nbd_release(struct gendisk *disk, fmode_t mode)
+{
+ struct nbd_device *nbd = disk->private_data;
+ nbd_config_put(nbd);
+ nbd_put(nbd);
+}
+
+/*
+ * Block device operations for nbd disks.  The same handler serves both the
+ * native and the compat ioctl entry points.
+ */
static const struct block_device_operations nbd_fops =
{
.owner = THIS_MODULE,
+ .open = nbd_open,
+ .release = nbd_release,
.ioctl = nbd_ioctl,
.compat_ioctl = nbd_ioctl,
};
@@ -927,7 +1296,7 @@ static const struct file_operations nbd_dbg_tasks_ops = {
static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
struct nbd_device *nbd = s->private;
- u32 flags = nbd->flags;
+ u32 flags = nbd->config->flags;
seq_printf(s, "Hex: 0x%08x\n\n", flags);
@@ -960,6 +1329,7 @@ static const struct file_operations nbd_dbg_flags_ops = {
static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
struct dentry *dir;
+ struct nbd_config *config = nbd->config;
if (!nbd_dbg_dir)
return -EIO;
@@ -970,12 +1340,12 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
nbd_name(nbd));
return -EIO;
}
- nbd->dbg_dir = dir;
+ config->dbg_dir = dir;
debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
- debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
+ debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
- debugfs_create_u64("blocksize", 0444, dir, &nbd->blksize);
+ debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
return 0;
@@ -983,7 +1353,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd)
static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
- debugfs_remove_recursive(nbd->dbg_dir);
+ debugfs_remove_recursive(nbd->config->dbg_dir);
}
static int nbd_dbg_init(void)
@@ -1035,25 +1405,13 @@ static int nbd_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops nbd_mq_ops = {
+static const struct blk_mq_ops nbd_mq_ops = {
.queue_rq = nbd_queue_rq,
+ .complete = nbd_complete_rq,
.init_request = nbd_init_request,
.timeout = nbd_xmit_timeout,
};
-static void nbd_dev_remove(struct nbd_device *nbd)
-{
- struct gendisk *disk = nbd->disk;
- nbd->magic = 0;
- if (disk) {
- del_gendisk(disk);
- blk_cleanup_queue(disk->queue);
- blk_mq_free_tag_set(&nbd->tag_set);
- put_disk(disk);
- }
- kfree(nbd);
-}
-
static int nbd_dev_add(int index)
{
struct nbd_device *nbd;
@@ -1082,6 +1440,7 @@ static int nbd_dev_add(int index)
if (err < 0)
goto out_free_disk;
+ nbd->index = index;
nbd->disk = disk;
nbd->tag_set.ops = &nbd_mq_ops;
nbd->tag_set.nr_hw_queues = 1;
@@ -1110,20 +1469,23 @@ static int nbd_dev_add(int index)
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
disk->queue->limits.discard_granularity = 512;
blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
- disk->queue->limits.discard_zeroes_data = 0;
+ blk_queue_max_segment_size(disk->queue, UINT_MAX);
+ blk_queue_max_segments(disk->queue, USHRT_MAX);
blk_queue_max_hw_sectors(disk->queue, 65536);
disk->queue->limits.max_sectors = 256;
- nbd->magic = NBD_MAGIC;
mutex_init(&nbd->config_lock);
+ refcount_set(&nbd->config_refs, 0);
+ refcount_set(&nbd->refs, 1);
+ INIT_LIST_HEAD(&nbd->list);
disk->major = NBD_MAJOR;
disk->first_minor = index << part_shift;
disk->fops = &nbd_fops;
disk->private_data = nbd;
sprintf(disk->disk_name, "nbd%d", index);
- init_waitqueue_head(&nbd->recv_wq);
nbd_reset(nbd);
add_disk(disk);
+ nbd_total_devices++;
return index;
out_free_tags:
@@ -1138,10 +1500,535 @@ out:
return err;
}
-/*
- * And here should be modules and kernel interface
- * (Just smiley confuses emacs :-)
+/*
+ * idr_for_each() callback: stop at the first device whose config_refs count
+ * is zero and hand it back through @data (a struct nbd_device **).
+ *
+ * Returns 1 to stop the walk once a device was stored, 0 to keep iterating.
+ */
+static int find_free_cb(int id, void *ptr, void *data)
+{
+	struct nbd_device *candidate = ptr;
+	struct nbd_device **slot = data;
+
+	if (refcount_read(&candidate->config_refs))
+		return 0;
+
+	*slot = candidate;
+	return 1;
+}
+
+/* Netlink interface. */
+
+/*
+ * Expected netlink attribute type for each top-level NBD_ATTR_* attribute.
+ * Used as the .policy of every operation in nbd_connect_genl_ops.
+ */
+static struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
+ [NBD_ATTR_INDEX] = { .type = NLA_U32 },
+ [NBD_ATTR_SIZE_BYTES] = { .type = NLA_U64 },
+ [NBD_ATTR_BLOCK_SIZE_BYTES] = { .type = NLA_U64 },
+ [NBD_ATTR_TIMEOUT] = { .type = NLA_U64 },
+ [NBD_ATTR_SERVER_FLAGS] = { .type = NLA_U64 },
+ [NBD_ATTR_CLIENT_FLAGS] = { .type = NLA_U64 },
+ [NBD_ATTR_SOCKETS] = { .type = NLA_NESTED},
+ [NBD_ATTR_DEAD_CONN_TIMEOUT] = { .type = NLA_U64 },
+ [NBD_ATTR_DEVICE_LIST] = { .type = NLA_NESTED},
+};
+
+/*
+ * Policy for the attributes nested inside each NBD_SOCK_ITEM of
+ * NBD_ATTR_SOCKETS; passed to nla_parse_nested() by the connect and
+ * reconfigure handlers.
+ */
+static struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
+ [NBD_SOCK_FD] = { .type = NLA_U32 },
+};
+
+/* We don't use this right now since we don't parse the incoming list, but we
+ * still want it here so userspace knows what to expect.
*/
+static struct nla_policy __attribute__((unused))
+nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
+ [NBD_DEVICE_INDEX] = { .type = NLA_U32 }, /* emitted via nla_put_u32() */
+ [NBD_DEVICE_CONNECTED] = { .type = NLA_U8 }, /* emitted via nla_put_u8() */
+};
+
+/*
+ * NBD_CMD_CONNECT handler: configure an nbd device from netlink attributes,
+ * attach the supplied sockets and start the device.
+ *
+ * NBD_ATTR_SOCKETS and NBD_ATTR_SIZE_BYTES are mandatory.  Without
+ * NBD_ATTR_INDEX (index == -1) a device with no live config is looked up in
+ * the idr, and a new device is added when none is found.  A device that is
+ * already configured yields -EBUSY when an index was given; in the
+ * index == -1 case the search is simply retried.
+ *
+ * On success the chosen index is sent back through nbd_connect_reply().
+ * Returns 0 on success or a negative errno.
+ */
+static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nbd_device *nbd = NULL;
+	struct nbd_config *config;
+	int index = -1;
+	int ret;
+	bool put_dev = false;
+
+	if (!netlink_capable(skb, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (info->attrs[NBD_ATTR_INDEX])
+		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+	if (!info->attrs[NBD_ATTR_SOCKETS]) {
+		printk(KERN_ERR "nbd: must specify at least one socket\n");
+		return -EINVAL;
+	}
+	if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
+		printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
+		return -EINVAL;
+	}
+again:
+	mutex_lock(&nbd_index_mutex);
+	if (index == -1) {
+		ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
+		if (ret == 0) {
+			int new_index;
+
+			new_index = nbd_dev_add(-1);
+			if (new_index < 0) {
+				mutex_unlock(&nbd_index_mutex);
+				printk(KERN_ERR "nbd: failed to add new device\n");
+				/*
+				 * ret is 0 on this path; return the error
+				 * from nbd_dev_add() so the failure is not
+				 * reported as success.
+				 */
+				return new_index;
+			}
+			nbd = idr_find(&nbd_index_idr, new_index);
+		}
+	} else {
+		nbd = idr_find(&nbd_index_idr, index);
+	}
+	if (!nbd) {
+		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
+		       index);
+		mutex_unlock(&nbd_index_mutex);
+		return -EINVAL;
+	}
+	if (!refcount_inc_not_zero(&nbd->refs)) {
+		mutex_unlock(&nbd_index_mutex);
+		if (index == -1)
+			goto again;
+		printk(KERN_ERR "nbd: device at index %d is going down\n",
+		       index);
+		return -EINVAL;
+	}
+	mutex_unlock(&nbd_index_mutex);
+
+	mutex_lock(&nbd->config_lock);
+	/* Someone else configured the device between lookup and lock. */
+	if (refcount_read(&nbd->config_refs)) {
+		mutex_unlock(&nbd->config_lock);
+		nbd_put(nbd);
+		if (index == -1)
+			goto again;
+		printk(KERN_ERR "nbd: nbd%d already in use\n", index);
+		return -EBUSY;
+	}
+	if (WARN_ON(nbd->config)) {
+		mutex_unlock(&nbd->config_lock);
+		nbd_put(nbd);
+		return -EINVAL;
+	}
+	config = nbd->config = nbd_alloc_config();
+	if (!nbd->config) {
+		mutex_unlock(&nbd->config_lock);
+		nbd_put(nbd);
+		printk(KERN_ERR "nbd: couldn't allocate config\n");
+		return -ENOMEM;
+	}
+	refcount_set(&nbd->config_refs, 1);
+	/* Mark the device as netlink-controlled. */
+	set_bit(NBD_BOUND, &config->runtime_flags);
+
+	if (info->attrs[NBD_ATTR_SIZE_BYTES]) {
+		u64 bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
+
+		nbd_size_set(nbd, config->blksize,
+			     div64_u64(bytes, config->blksize));
+	}
+	if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
+		u64 bsize =
+			nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
+
+		/* bsize is the divisor below; a zero value must be refused. */
+		if (!bsize) {
+			printk(KERN_ERR "nbd: block size must be non-zero\n");
+			ret = -EINVAL;
+			goto out;
+		}
+		nbd_size_set(nbd, bsize, div64_u64(config->bytesize, bsize));
+	}
+	if (info->attrs[NBD_ATTR_TIMEOUT]) {
+		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
+
+		nbd->tag_set.timeout = timeout * HZ;
+		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+	}
+	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+		config->dead_conn_timeout =
+			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+		config->dead_conn_timeout *= HZ;
+	}
+	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
+		config->flags =
+			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
+	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
+		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
+
+		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
+			set_bit(NBD_DESTROY_ON_DISCONNECT,
+				&config->runtime_flags);
+			/* Drop one device reference before returning. */
+			put_dev = true;
+		}
+	}
+
+	if (info->attrs[NBD_ATTR_SOCKETS]) {
+		struct nlattr *attr;
+		int rem, fd;
+
+		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
+				    rem) {
+			struct nlattr *socks[NBD_SOCK_MAX+1];
+
+			if (nla_type(attr) != NBD_SOCK_ITEM) {
+				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
+					       nbd_sock_policy);
+			if (ret != 0) {
+				printk(KERN_ERR "nbd: error processing sock list\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			/* Items without an fd attribute are skipped. */
+			if (!socks[NBD_SOCK_FD])
+				continue;
+			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
+			ret = nbd_add_socket(nbd, fd, true);
+			if (ret)
+				goto out;
+		}
+	}
+	ret = nbd_start_device(nbd);
+out:
+	mutex_unlock(&nbd->config_lock);
+	if (!ret) {
+		/*
+		 * Take an extra config reference and record it with
+		 * NBD_HAS_CONFIG_REF, so the put below does not drop the
+		 * last reference of a device that started successfully.
+		 */
+		set_bit(NBD_HAS_CONFIG_REF, &config->runtime_flags);
+		refcount_inc(&nbd->config_refs);
+		nbd_connect_reply(info, nbd->index);
+	}
+	nbd_config_put(nbd);
+	if (put_dev)
+		nbd_put(nbd);
+	return ret;
+}
+
+/*
+ * NBD_CMD_DISCONNECT handler: disconnect the device named by NBD_ATTR_INDEX.
+ *
+ * Returns -EPERM without CAP_SYS_ADMIN, -EINVAL when the index is missing,
+ * unknown or the device refcount already hit zero, and 0 otherwise -
+ * including when the device has no live config to disconnect.
+ */
+static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nbd_device *nbd;
+ int index;
+
+ if (!netlink_capable(skb, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!info->attrs[NBD_ATTR_INDEX]) {
+ printk(KERN_ERR "nbd: must specify an index to disconnect\n");
+ return -EINVAL;
+ }
+ index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+ mutex_lock(&nbd_index_mutex);
+ nbd = idr_find(&nbd_index_idr, index);
+ if (!nbd) {
+ mutex_unlock(&nbd_index_mutex);
+ printk(KERN_ERR "nbd: couldn't find device at index %d\n",
+ index);
+ return -EINVAL;
+ }
+ if (!refcount_inc_not_zero(&nbd->refs)) {
+ mutex_unlock(&nbd_index_mutex);
+ printk(KERN_ERR "nbd: device at index %d is going down\n",
+ index);
+ return -EINVAL;
+ }
+ mutex_unlock(&nbd_index_mutex);
+ /* No live config: nothing to disconnect, report success. */
+ if (!refcount_inc_not_zero(&nbd->config_refs)) {
+ nbd_put(nbd);
+ return 0;
+ }
+ mutex_lock(&nbd->config_lock);
+ nbd_disconnect(nbd);
+ mutex_unlock(&nbd->config_lock);
+ /* Drop the reference tracked by NBD_HAS_CONFIG_REF, if still flagged. */
+ if (test_and_clear_bit(NBD_HAS_CONFIG_REF,
+ &nbd->config->runtime_flags))
+ nbd_config_put(nbd);
+ /* Drop the config and device references taken in this function. */
+ nbd_config_put(nbd);
+ nbd_put(nbd);
+ return 0;
+}
+
+/*
+ * NBD_CMD_RECONFIGURE handler: update a running, netlink-bound device.
+ *
+ * NBD_ATTR_INDEX is mandatory.  Optional attributes update the request
+ * timeout, the dead connection timeout, the DESTROY_ON_DISCONNECT client
+ * flag, and hand replacement sockets to nbd_reconnect_socket().
+ *
+ * Returns 0 on success or a negative errno.  -ENOSPC from
+ * nbd_reconnect_socket() ends the socket loop but is reported as success.
+ */
+static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nbd_device *nbd = NULL;
+	struct nbd_config *config;
+	int index;
+	/*
+	 * Start from success: a request without NBD_ATTR_SOCKETS never
+	 * assigns ret below and must not be reported as a failure.
+	 */
+	int ret = 0;
+	bool put_dev = false;
+
+	if (!netlink_capable(skb, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!info->attrs[NBD_ATTR_INDEX]) {
+		printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
+		return -EINVAL;
+	}
+	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+	mutex_lock(&nbd_index_mutex);
+	nbd = idr_find(&nbd_index_idr, index);
+	if (!nbd) {
+		mutex_unlock(&nbd_index_mutex);
+		printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
+		       index);
+		return -EINVAL;
+	}
+	if (!refcount_inc_not_zero(&nbd->refs)) {
+		mutex_unlock(&nbd_index_mutex);
+		printk(KERN_ERR "nbd: device at index %d is going down\n",
+		       index);
+		return -EINVAL;
+	}
+	mutex_unlock(&nbd_index_mutex);
+
+	if (!refcount_inc_not_zero(&nbd->config_refs)) {
+		dev_err(nbd_to_dev(nbd),
+			"not configured, cannot reconfigure\n");
+		nbd_put(nbd);
+		return -EINVAL;
+	}
+
+	mutex_lock(&nbd->config_lock);
+	config = nbd->config;
+	/* Only a netlink-bound device with a receive task can be changed. */
+	if (!test_bit(NBD_BOUND, &config->runtime_flags) ||
+	    !nbd->task_recv) {
+		dev_err(nbd_to_dev(nbd),
+			"not configured, cannot reconfigure\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (info->attrs[NBD_ATTR_TIMEOUT]) {
+		u64 timeout = nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]);
+
+		nbd->tag_set.timeout = timeout * HZ;
+		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+	}
+	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+		config->dead_conn_timeout =
+			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+		config->dead_conn_timeout *= HZ;
+	}
+	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
+		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
+
+		/*
+		 * Setting the flag for the first time drops a device
+		 * reference at the end of this function; clearing a flag
+		 * that was set takes one.
+		 */
+		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
+			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+					      &config->runtime_flags))
+				put_dev = true;
+		} else {
+			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+					       &config->runtime_flags))
+				refcount_inc(&nbd->refs);
+		}
+	}
+
+	if (info->attrs[NBD_ATTR_SOCKETS]) {
+		struct nlattr *attr;
+		int rem, fd;
+
+		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
+				    rem) {
+			struct nlattr *socks[NBD_SOCK_MAX+1];
+
+			if (nla_type(attr) != NBD_SOCK_ITEM) {
+				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			ret = nla_parse_nested(socks, NBD_SOCK_MAX, attr,
+					       nbd_sock_policy);
+			if (ret != 0) {
+				printk(KERN_ERR "nbd: error processing sock list\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			/* Items without an fd attribute are skipped. */
+			if (!socks[NBD_SOCK_FD])
+				continue;
+			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
+			ret = nbd_reconnect_socket(nbd, fd);
+			if (ret) {
+				/* -ENOSPC is not reported as an error. */
+				if (ret == -ENOSPC)
+					ret = 0;
+				goto out;
+			}
+			dev_info(nbd_to_dev(nbd), "reconnected socket\n");
+		}
+	}
+out:
+	mutex_unlock(&nbd->config_lock);
+	nbd_config_put(nbd);
+	nbd_put(nbd);
+	if (put_dev)
+		nbd_put(nbd);
+	return ret;
+}
+
+/*
+ * Generic netlink operations exported by nbd.  Every command shares
+ * nbd_attr_policy for its top-level attributes.
+ */
+static const struct genl_ops nbd_connect_genl_ops[] = {
+ {
+ .cmd = NBD_CMD_CONNECT,
+ .policy = nbd_attr_policy,
+ .doit = nbd_genl_connect,
+ },
+ {
+ .cmd = NBD_CMD_DISCONNECT,
+ .policy = nbd_attr_policy,
+ .doit = nbd_genl_disconnect,
+ },
+ {
+ .cmd = NBD_CMD_RECONFIGURE,
+ .policy = nbd_attr_policy,
+ .doit = nbd_genl_reconfigure,
+ },
+ {
+ .cmd = NBD_CMD_STATUS,
+ .policy = nbd_attr_policy,
+ .doit = nbd_genl_status,
+ },
+};
+
+/* Multicast groups of the nbd family; a single group is registered. */
+static const struct genl_multicast_group nbd_mcast_grps[] = {
+ { .name = NBD_GENL_MCAST_GROUP_NAME, },
+};
+
+/*
+ * Generic netlink family descriptor for nbd, tying together the operations
+ * table and the multicast groups.  No family-specific header is used
+ * (hdrsize is 0).
+ */
+static struct genl_family nbd_genl_family __ro_after_init = {
+ .hdrsize = 0,
+ .name = NBD_GENL_FAMILY_NAME,
+ .version = NBD_GENL_VERSION,
+ .module = THIS_MODULE,
+ .ops = nbd_connect_genl_ops,
+ .n_ops = ARRAY_SIZE(nbd_connect_genl_ops),
+ .maxattr = NBD_ATTR_MAX,
+ .mcgrps = nbd_mcast_grps,
+ .n_mcgrps = ARRAY_SIZE(nbd_mcast_grps),
+};
+
+/*
+ * Append one NBD_DEVICE_ITEM nest to @reply carrying the device index and a
+ * "connected" byte: 1 when config_refs is non-zero, 0 otherwise.
+ *
+ * Returns 0 on success, -EMSGSIZE when the nest or either attribute cannot
+ * be added.
+ */
+static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
+{
+	struct nlattr *item;
+	u8 connected;
+
+	/*
+	 * config_refs is sampled without taking a reference, so the answer
+	 * is slightly racy; for a status query that is fine.  Taking a ref
+	 * is avoided because in the index == -1 case it would have to be
+	 * put under nbd_index_mutex, which could deadlock if the device is
+	 * configured to remove itself once disconnected.
+	 */
+	connected = refcount_read(&nbd->config_refs) ? 1 : 0;
+
+	item = nla_nest_start(reply, NBD_DEVICE_ITEM);
+	if (!item)
+		return -EMSGSIZE;
+	if (nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index))
+		return -EMSGSIZE;
+	if (nla_put_u8(reply, NBD_DEVICE_CONNECTED, connected))
+		return -EMSGSIZE;
+	nla_nest_end(reply, item);
+	return 0;
+}
+
+/* idr_for_each() callback: add the status of one device to the reply skb. */
+static int status_cb(int id, void *ptr, void *data)
+{
+	struct sk_buff *reply = data;
+
+	return populate_nbd_status(ptr, reply);
+}
+
+/*
+ * NBD_CMD_STATUS handler: reply with an NBD_ATTR_DEVICE_LIST nest holding
+ * one status item per device, or only the device named by NBD_ATTR_INDEX
+ * when that attribute is present.  An index that matches no device yields
+ * an empty list.
+ *
+ * Returns 0 on success, -ENOMEM when the reply cannot be allocated or its
+ * header cannot be added, -EMSGSIZE when the reply runs out of room.
+ */
+static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *dev_list;
+	struct sk_buff *reply;
+	void *reply_head;
+	size_t msg_size;
+	int index = -1;
+	int ret = -ENOMEM;
+
+	if (info->attrs[NBD_ATTR_INDEX])
+		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+
+	mutex_lock(&nbd_index_mutex);
+
+	/* One (u32 index, u8 connected) item per reported device. */
+	msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
+				  nla_attr_size(sizeof(u8)));
+	msg_size *= (index == -1) ? nbd_total_devices : 1;
+
+	reply = genlmsg_new(msg_size, GFP_KERNEL);
+	if (!reply)
+		goto out;
+	reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
+				       NBD_CMD_STATUS);
+	if (!reply_head) {
+		nlmsg_free(reply);
+		goto out;
+	}
+
+	dev_list = nla_nest_start(reply, NBD_ATTR_DEVICE_LIST);
+	/* The result is passed to nla_nest_end() below; it must not be NULL. */
+	if (!dev_list) {
+		nlmsg_free(reply);
+		ret = -EMSGSIZE;
+		goto out;
+	}
+	if (index == -1) {
+		ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
+		if (ret) {
+			nlmsg_free(reply);
+			goto out;
+		}
+	} else {
+		struct nbd_device *nbd;
+
+		nbd = idr_find(&nbd_index_idr, index);
+		if (nbd) {
+			ret = populate_nbd_status(nbd, reply);
+			if (ret) {
+				nlmsg_free(reply);
+				goto out;
+			}
+		}
+	}
+	nla_nest_end(reply, dev_list);
+	genlmsg_end(reply, reply_head);
+	genlmsg_reply(reply, info);
+	ret = 0;
+out:
+	mutex_unlock(&nbd_index_mutex);
+	return ret;
+}
+
+/*
+ * Send the NBD_CMD_CONNECT reply carrying the device index back to the
+ * requester.  Failures while building the message are silent: the message
+ * is freed and nothing is sent.
+ */
+static void nbd_connect_reply(struct genl_info *info, int index)
+{
+	struct sk_buff *reply;
+	void *hdr;
+
+	reply = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
+	if (!reply)
+		return;
+
+	hdr = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
+				NBD_CMD_CONNECT);
+	if (!hdr)
+		goto free_reply;
+	if (nla_put_u32(reply, NBD_ATTR_INDEX, index))
+		goto free_reply;
+
+	genlmsg_end(reply, hdr);
+	genlmsg_reply(reply, info);
+	return;
+
+free_reply:
+	nlmsg_free(reply);
+}
+
+/*
+ * Multicast an NBD_CMD_LINK_DEAD message carrying the device index to the
+ * first multicast group of the nbd family.  Failures while building the
+ * message are silent: the message is freed and nothing is sent.
+ */
+static void nbd_mcast_index(int index)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	msg = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
+	if (!msg)
+		return;
+
+	hdr = genlmsg_put(msg, 0, 0, &nbd_genl_family, 0,
+			  NBD_CMD_LINK_DEAD);
+	if (!hdr)
+		goto free_msg;
+	if (nla_put_u32(msg, NBD_ATTR_INDEX, index))
+		goto free_msg;
+
+	genlmsg_end(msg, hdr);
+	genlmsg_multicast(&nbd_genl_family, msg, 0, 0, GFP_KERNEL);
+	return;
+
+free_msg:
+	nlmsg_free(msg);
+}
+
+/*
+ * Work item handler: multicast the index stored in the enclosing
+ * link_dead_args, then free that structure.
+ */
+static void nbd_dead_link_work(struct work_struct *work)
+{
+	struct link_dead_args *dead;
+
+	dead = container_of(work, struct link_dead_args, work);
+	nbd_mcast_index(dead->index);
+	kfree(dead);
+}
static int __init nbd_init(void)
{
@@ -1184,6 +2071,11 @@ static int __init nbd_init(void)
return -EIO;
}
+ if (genl_register_family(&nbd_genl_family)) {
+ unregister_blkdev(NBD_MAJOR, "nbd");
+ destroy_workqueue(recv_workqueue);
+ return -EINVAL;
+ }
nbd_dbg_init();
mutex_lock(&nbd_index_mutex);
@@ -1195,17 +2087,34 @@ static int __init nbd_init(void)
static int nbd_exit_cb(int id, void *ptr, void *data)
{
+ struct list_head *list = (struct list_head *)data;
struct nbd_device *nbd = ptr;
- nbd_dev_remove(nbd);
+
+ list_add_tail(&nbd->list, list);
return 0;
}
static void __exit nbd_cleanup(void)
{
+ struct nbd_device *nbd;
+ LIST_HEAD(del_list);
+
nbd_dbg_close();
- idr_for_each(&nbd_index_idr, &nbd_exit_cb, NULL);
+ mutex_lock(&nbd_index_mutex);
+ idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
+ mutex_unlock(&nbd_index_mutex);
+
+ while (!list_empty(&del_list)) {
+ nbd = list_first_entry(&del_list, struct nbd_device, list);
+ list_del_init(&nbd->list);
+ if (refcount_read(&nbd->refs) != 1)
+ printk(KERN_ERR "nbd: possibly leaking a device\n");
+ nbd_put(nbd);
+ }
+
idr_destroy(&nbd_index_idr);
+ genl_unregister_family(&nbd_genl_family);
destroy_workqueue(recv_workqueue);
unregister_blkdev(NBD_MAJOR, "nbd");
}
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 6f2e565bccc59..d946e1eeac8ef 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -117,6 +117,10 @@ static bool use_lightnvm;
module_param(use_lightnvm, bool, S_IRUGO);
MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
+static bool blocking;
+module_param(blocking, bool, S_IRUGO);
+MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
+
static int irqmode = NULL_IRQ_SOFTIRQ;
static int null_set_irqmode(const char *str, const struct kernel_param *kp)
@@ -277,7 +281,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd)
case NULL_IRQ_SOFTIRQ:
switch (queue_mode) {
case NULL_Q_MQ:
- blk_mq_complete_request(cmd->rq, cmd->rq->errors);
+ blk_mq_complete_request(cmd->rq);
break;
case NULL_Q_RQ:
blk_complete_request(cmd->rq);
@@ -357,6 +361,8 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
{
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+
if (irqmode == NULL_IRQ_TIMER) {
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
@@ -392,7 +398,7 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
return 0;
}
-static struct blk_mq_ops null_mq_ops = {
+static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
.init_hctx = null_init_hctx,
.complete = null_softirq_done_fn,
@@ -437,14 +443,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
if (IS_ERR(rq))
return -ENOMEM;
- rq->__sector = bio->bi_iter.bi_sector;
- rq->ioprio = bio_prio(bio);
-
- if (bio_has_data(bio))
- rq->nr_phys_segments = bio_phys_segments(q, bio);
-
- rq->__data_len = bio->bi_iter.bi_size;
- rq->bio = rq->biotail = bio;
+ blk_init_request_from_bio(rq, bio);
rq->end_io_data = rqd;
@@ -724,6 +723,9 @@ static int null_add_dev(void)
nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
nullb->tag_set.driver_data = nullb;
+ if (blocking)
+ nullb->tag_set.flags |= BLK_MQ_F_BLOCKING;
+
rv = blk_mq_alloc_tag_set(&nullb->tag_set);
if (rv)
goto out_cleanup_queues;
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
deleted file mode 100644
index 8127b8201a011..0000000000000
--- a/drivers/block/osdblk.c
+++ /dev/null
@@ -1,693 +0,0 @@
-
-/*
- osdblk.c -- Export a single SCSI OSD object as a Linux block device
-
-
- Copyright 2009 Red Hat, Inc.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; see the file COPYING. If not, write to
- the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
-
-
- Instructions for use
- --------------------
-
- 1) Map a Linux block device to an existing OSD object.
-
- In this example, we will use partition id 1234, object id 5678,
- OSD device /dev/osd1.
-
- $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
-
-
- 2) List all active blkdev<->object mappings.
-
- In this example, we have performed step #1 twice, creating two blkdevs,
- mapped to two separate OSD objects.
-
- $ cat /sys/class/osdblk/list
- 0 174 1234 5678 /dev/osd1
- 1 179 1994 897123 /dev/osd0
-
- The columns, in order, are:
- - blkdev unique id
- - blkdev assigned major
- - OSD object partition id
- - OSD object id
- - OSD device
-
-
- 3) Remove an active blkdev<->object mapping.
-
- In this example, we remove the mapping with blkdev unique id 1.
-
- $ echo 1 > /sys/class/osdblk/remove
-
-
- NOTE: The actual creation and deletion of OSD objects is outside the scope
- of this driver.
-
- */
-
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include <scsi/osd_initiator.h>
-#include <scsi/osd_attributes.h>
-#include <scsi/osd_sec.h>
-#include <scsi/scsi_device.h>
-
-#define DRV_NAME "osdblk"
-#define PFX DRV_NAME ": "
-
-/* #define _OSDBLK_DEBUG */
-#ifdef _OSDBLK_DEBUG
-#define OSDBLK_DEBUG(fmt, a...) \
- printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a)
-#else
-#define OSDBLK_DEBUG(fmt, a...) \
- do { if (0) printk(fmt, ##a); } while (0)
-#endif
-
-MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
-MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko");
-MODULE_LICENSE("GPL");
-
-struct osdblk_device;
-
-enum {
- OSDBLK_MINORS_PER_MAJOR = 256, /* max minors per blkdev */
- OSDBLK_MAX_REQ = 32, /* max parallel requests */
- OSDBLK_OP_TIMEOUT = 4 * 60, /* sync OSD req timeout */
-};
-
-struct osdblk_request {
- struct request *rq; /* blk layer request */
- struct bio *bio; /* cloned bio */
- struct osdblk_device *osdev; /* associated blkdev */
-};
-
-struct osdblk_device {
- int id; /* blkdev unique id */
-
- int major; /* blkdev assigned major */
- struct gendisk *disk; /* blkdev's gendisk and rq */
- struct request_queue *q;
-
- struct osd_dev *osd; /* associated OSD */
-
- char name[32]; /* blkdev name, e.g. osdblk34 */
-
- spinlock_t lock; /* queue lock */
-
- struct osd_obj_id obj; /* OSD partition, obj id */
- uint8_t obj_cred[OSD_CAP_LEN]; /* OSD cred */
-
- struct osdblk_request req[OSDBLK_MAX_REQ]; /* request table */
-
- struct list_head node;
-
- char osd_path[0]; /* OSD device path */
-};
-
-static struct class *class_osdblk; /* /sys/class/osdblk */
-static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
-static LIST_HEAD(osdblkdev_list);
-
-static const struct block_device_operations osdblk_bd_ops = {
- .owner = THIS_MODULE,
-};
-
-static const struct osd_attr g_attr_logical_length = ATTR_DEF(
- OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
-
-static void osdblk_make_credential(u8 cred_a[OSD_CAP_LEN],
- const struct osd_obj_id *obj)
-{
- osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
-}
-
-/* copied from exofs; move to libosd? */
-/*
- * Perform a synchronous OSD operation. copied from exofs; move to libosd?
- */
-static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
-{
- int ret;
-
- or->timeout = timeout;
- ret = osd_finalize_request(or, 0, credential, NULL);
- if (ret)
- return ret;
-
- ret = osd_execute_request(or);
-
- /* osd_req_decode_sense(or, ret); */
- return ret;
-}
-
-/*
- * Perform an asynchronous OSD operation. copied from exofs; move to libosd?
- */
-static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
- void *caller_context, u8 *cred)
-{
- int ret;
-
- ret = osd_finalize_request(or, 0, cred, NULL);
- if (ret)
- return ret;
-
- ret = osd_execute_request_async(or, async_done, caller_context);
-
- return ret;
-}
-
-/* copied from exofs; move to libosd? */
-static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
-{
- struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
- void *iter = NULL;
- int nelem;
-
- do {
- nelem = 1;
- osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
- if ((cur_attr.attr_page == attr->attr_page) &&
- (cur_attr.attr_id == attr->attr_id)) {
- attr->len = cur_attr.len;
- attr->val_ptr = cur_attr.val_ptr;
- return 0;
- }
- } while (iter);
-
- return -EIO;
-}
-
-static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
-{
- struct osd_request *or;
- struct osd_attr attr;
- int ret;
-
- /* start request */
- or = osd_start_request(osdev->osd, GFP_KERNEL);
- if (!or)
- return -ENOMEM;
-
- /* create a get-attributes(length) request */
- osd_req_get_attributes(or, &osdev->obj);
-
- osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
-
- /* execute op synchronously */
- ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
- if (ret)
- goto out;
-
- /* extract length from returned attribute info */
- attr = g_attr_logical_length;
- ret = extract_attr_from_req(or, &attr);
- if (ret)
- goto out;
-
- *size_out = get_unaligned_be64(attr.val_ptr);
-
-out:
- osd_end_request(or);
- return ret;
-
-}
-
-static void osdblk_osd_complete(struct osd_request *or, void *private)
-{
- struct osdblk_request *orq = private;
- struct osd_sense_info osi;
- int ret = osd_req_decode_sense(or, &osi);
-
- if (ret) {
- ret = -EIO;
- OSDBLK_DEBUG("osdblk_osd_complete with err=%d\n", ret);
- }
-
- /* complete OSD request */
- osd_end_request(or);
-
- /* complete request passed to osdblk by block layer */
- __blk_end_request_all(orq->rq, ret);
-}
-
-static void bio_chain_put(struct bio *chain)
-{
- struct bio *tmp;
-
- while (chain) {
- tmp = chain;
- chain = chain->bi_next;
-
- bio_put(tmp);
- }
-}
-
-static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
-{
- struct bio *tmp, *new_chain = NULL, *tail = NULL;
-
- while (old_chain) {
- tmp = bio_clone_kmalloc(old_chain, gfpmask);
- if (!tmp)
- goto err_out;
-
- tmp->bi_bdev = NULL;
- gfpmask &= ~__GFP_DIRECT_RECLAIM;
- tmp->bi_next = NULL;
-
- if (!new_chain)
- new_chain = tail = tmp;
- else {
- tail->bi_next = tmp;
- tail = tmp;
- }
-
- old_chain = old_chain->bi_next;
- }
-
- return new_chain;
-
-err_out:
- OSDBLK_DEBUG("bio_chain_clone with err\n");
- bio_chain_put(new_chain);
- return NULL;
-}
-
-static void osdblk_rq_fn(struct request_queue *q)
-{
- struct osdblk_device *osdev = q->queuedata;
-
- while (1) {
- struct request *rq;
- struct osdblk_request *orq;
- struct osd_request *or;
- struct bio *bio;
- bool do_write, do_flush;
-
- /* peek at request from block layer */
- rq = blk_fetch_request(q);
- if (!rq)
- break;
-
- /* deduce our operation (read, write, flush) */
- /* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
- * into a clearly defined set of RPC commands:
- * read, write, flush, scsi command, power mgmt req,
- * driver-specific, etc.
- */
-
- do_flush = (req_op(rq) == REQ_OP_FLUSH);
- do_write = (rq_data_dir(rq) == WRITE);
-
- if (!do_flush) { /* osd_flush does not use a bio */
- /* a bio clone to be passed down to OSD request */
- bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
- if (!bio)
- break;
- } else
- bio = NULL;
-
- /* alloc internal OSD request, for OSD command execution */
- or = osd_start_request(osdev->osd, GFP_ATOMIC);
- if (!or) {
- bio_chain_put(bio);
- OSDBLK_DEBUG("osd_start_request with err\n");
- break;
- }
-
- orq = &osdev->req[rq->tag];
- orq->rq = rq;
- orq->bio = bio;
- orq->osdev = osdev;
-
- /* init OSD command: flush, write or read */
- if (do_flush)
- osd_req_flush_object(or, &osdev->obj,
- OSD_CDB_FLUSH_ALL, 0, 0);
- else if (do_write)
- osd_req_write(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
- bio, blk_rq_bytes(rq));
- else
- osd_req_read(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
- bio, blk_rq_bytes(rq));
-
- OSDBLK_DEBUG("%s 0x%x bytes at 0x%llx\n",
- do_flush ? "flush" : do_write ?
- "write" : "read", blk_rq_bytes(rq),
- blk_rq_pos(rq) * 512ULL);
-
- /* begin OSD command execution */
- if (osd_async_op(or, osdblk_osd_complete, orq,
- osdev->obj_cred)) {
- osd_end_request(or);
- blk_requeue_request(q, rq);
- bio_chain_put(bio);
- OSDBLK_DEBUG("osd_execute_request_async with err\n");
- break;
- }
-
- /* remove the special 'flush' marker, now that the command
- * is executing
- */
- rq->special = NULL;
- }
-}
-
-static void osdblk_free_disk(struct osdblk_device *osdev)
-{
- struct gendisk *disk = osdev->disk;
-
- if (!disk)
- return;
-
- if (disk->flags & GENHD_FL_UP)
- del_gendisk(disk);
- if (disk->queue)
- blk_cleanup_queue(disk->queue);
- put_disk(disk);
-}
-
-static int osdblk_init_disk(struct osdblk_device *osdev)
-{
- struct gendisk *disk;
- struct request_queue *q;
- int rc;
- u64 obj_size = 0;
-
- /* contact OSD, request size info about the object being mapped */
- rc = osdblk_get_obj_size(osdev, &obj_size);
- if (rc)
- return rc;
-
- /* create gendisk info */
- disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
- if (!disk)
- return -ENOMEM;
-
- sprintf(disk->disk_name, DRV_NAME "%d", osdev->id);
- disk->major = osdev->major;
- disk->first_minor = 0;
- disk->fops = &osdblk_bd_ops;
- disk->private_data = osdev;
-
- /* init rq */
- q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
- if (!q) {
- put_disk(disk);
- return -ENOMEM;
- }
-
- /* switch queue to TCQ mode; allocate tag map */
- rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO);
- if (rc) {
- blk_cleanup_queue(q);
- put_disk(disk);
- return rc;
- }
-
- /* Set our limits to the lower device limits, because osdblk cannot
- * sleep when allocating a lower-request and therefore cannot be
- * bouncing.
- */
- blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
-
- blk_queue_prep_rq(q, blk_queue_start_tag);
- blk_queue_write_cache(q, true, false);
-
- disk->queue = q;
-
- q->queuedata = osdev;
-
- osdev->disk = disk;
- osdev->q = q;
-
- /* finally, announce the disk to the world */
- set_capacity(disk, obj_size / 512ULL);
- add_disk(disk);
-
- printk(KERN_INFO "%s: Added of size 0x%llx\n",
- disk->disk_name, (unsigned long long)obj_size);
-
- return 0;
-}
-
-/********************************************************************
- * /sys/class/osdblk/
- * add map OSD object to blkdev
- * remove unmap OSD object
- * list show mappings
- *******************************************************************/
-
-static void class_osdblk_release(struct class *cls)
-{
- kfree(cls);
-}
-
-static ssize_t class_osdblk_list(struct class *c,
- struct class_attribute *attr,
- char *data)
-{
- int n = 0;
- struct list_head *tmp;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- list_for_each(tmp, &osdblkdev_list) {
- struct osdblk_device *osdev;
-
- osdev = list_entry(tmp, struct osdblk_device, node);
-
- n += sprintf(data+n, "%d %d %llu %llu %s\n",
- osdev->id,
- osdev->major,
- osdev->obj.partition,
- osdev->obj.id,
- osdev->osd_path);
- }
-
- mutex_unlock(&ctl_mutex);
- return n;
-}
-
-static ssize_t class_osdblk_add(struct class *c,
- struct class_attribute *attr,
- const char *buf, size_t count)
-{
- struct osdblk_device *osdev;
- ssize_t rc;
- int irc, new_id = 0;
- struct list_head *tmp;
-
- if (!try_module_get(THIS_MODULE))
- return -ENODEV;
-
- /* new osdblk_device object */
- osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
- if (!osdev) {
- rc = -ENOMEM;
- goto err_out_mod;
- }
-
- /* static osdblk_device initialization */
- spin_lock_init(&osdev->lock);
- INIT_LIST_HEAD(&osdev->node);
-
- /* generate unique id: find highest unique id, add one */
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- list_for_each(tmp, &osdblkdev_list) {
- struct osdblk_device *osdev;
-
- osdev = list_entry(tmp, struct osdblk_device, node);
- if (osdev->id > new_id)
- new_id = osdev->id + 1;
- }
-
- osdev->id = new_id;
-
- /* add to global list */
- list_add_tail(&osdev->node, &osdblkdev_list);
-
- mutex_unlock(&ctl_mutex);
-
- /* parse add command */
- if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
- osdev->osd_path) != 3) {
- rc = -EINVAL;
- goto err_out_slot;
- }
-
- /* initialize rest of new object */
- sprintf(osdev->name, DRV_NAME "%d", osdev->id);
-
- /* contact requested OSD */
- osdev->osd = osduld_path_lookup(osdev->osd_path);
- if (IS_ERR(osdev->osd)) {
- rc = PTR_ERR(osdev->osd);
- goto err_out_slot;
- }
-
- /* build OSD credential */
- osdblk_make_credential(osdev->obj_cred, &osdev->obj);
-
- /* register our block device */
- irc = register_blkdev(0, osdev->name);
- if (irc < 0) {
- rc = irc;
- goto err_out_osd;
- }
-
- osdev->major = irc;
-
- /* set up and announce blkdev mapping */
- rc = osdblk_init_disk(osdev);
- if (rc)
- goto err_out_blkdev;
-
- return count;
-
-err_out_blkdev:
- unregister_blkdev(osdev->major, osdev->name);
-err_out_osd:
- osduld_put_device(osdev->osd);
-err_out_slot:
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- list_del_init(&osdev->node);
- mutex_unlock(&ctl_mutex);
-
- kfree(osdev);
-err_out_mod:
- OSDBLK_DEBUG("Error adding device %s\n", buf);
- module_put(THIS_MODULE);
- return rc;
-}
-
-static ssize_t class_osdblk_remove(struct class *c,
- struct class_attribute *attr,
- const char *buf,
- size_t count)
-{
- struct osdblk_device *osdev = NULL;
- int target_id, rc;
- unsigned long ul;
- struct list_head *tmp;
-
- rc = kstrtoul(buf, 10, &ul);
- if (rc)
- return rc;
-
- /* convert to int; abort if we lost anything in the conversion */
- target_id = (int) ul;
- if (target_id != ul)
- return -EINVAL;
-
- /* remove object from list immediately */
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- list_for_each(tmp, &osdblkdev_list) {
- osdev = list_entry(tmp, struct osdblk_device, node);
- if (osdev->id == target_id) {
- list_del_init(&osdev->node);
- break;
- }
- osdev = NULL;
- }
-
- mutex_unlock(&ctl_mutex);
-
- if (!osdev)
- return -ENOENT;
-
- /* clean up and free blkdev and associated OSD connection */
- osdblk_free_disk(osdev);
- unregister_blkdev(osdev->major, osdev->name);
- osduld_put_device(osdev->osd);
- kfree(osdev);
-
- /* release module ref */
- module_put(THIS_MODULE);
-
- return count;
-}
-
-static struct class_attribute class_osdblk_attrs[] = {
- __ATTR(add, 0200, NULL, class_osdblk_add),
- __ATTR(remove, 0200, NULL, class_osdblk_remove),
- __ATTR(list, 0444, class_osdblk_list, NULL),
- __ATTR_NULL
-};
-
-static int osdblk_sysfs_init(void)
-{
- int ret = 0;
-
- /*
- * create control files in sysfs
- * /sys/class/osdblk/...
- */
- class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
- if (!class_osdblk)
- return -ENOMEM;
-
- class_osdblk->name = DRV_NAME;
- class_osdblk->owner = THIS_MODULE;
- class_osdblk->class_release = class_osdblk_release;
- class_osdblk->class_attrs = class_osdblk_attrs;
-
- ret = class_register(class_osdblk);
- if (ret) {
- kfree(class_osdblk);
- class_osdblk = NULL;
- printk(PFX "failed to create class osdblk\n");
- return ret;
- }
-
- return 0;
-}
-
-static void osdblk_sysfs_cleanup(void)
-{
- if (class_osdblk)
- class_destroy(class_osdblk);
- class_osdblk = NULL;
-}
-
-static int __init osdblk_init(void)
-{
- int rc;
-
- rc = osdblk_sysfs_init();
- if (rc)
- return rc;
-
- return 0;
-}
-
-static void __exit osdblk_exit(void)
-{
- osdblk_sysfs_cleanup();
-}
-
-module_init(osdblk_init);
-module_exit(osdblk_exit);
-
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 939641d6e2625..b1267ef34d5a7 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -300,6 +300,11 @@ static void pcd_init_units(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
continue;
+ disk->queue = blk_init_queue(do_pcd_request, &pcd_lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ continue;
+ }
cd->disk = disk;
cd->pi = &cd->pia;
cd->present = 0;
@@ -735,18 +740,36 @@ static int pcd_detect(void)
}
/* I/O request processing */
-static struct request_queue *pcd_queue;
+static int pcd_queue;
+
+static int set_next_request(void)
+{
+ struct pcd_unit *cd;
+ struct request_queue *q;
+ int old_pos = pcd_queue;
+
+ do {
+ cd = &pcd[pcd_queue];
+ q = cd->present ? cd->disk->queue : NULL;
+ if (++pcd_queue == PCD_UNITS)
+ pcd_queue = 0;
+ if (q) {
+ pcd_req = blk_fetch_request(q);
+ if (pcd_req)
+ break;
+ }
+ } while (pcd_queue != old_pos);
+
+ return pcd_req != NULL;
+}
-static void do_pcd_request(struct request_queue * q)
+static void pcd_request(void)
{
if (pcd_busy)
return;
while (1) {
- if (!pcd_req) {
- pcd_req = blk_fetch_request(q);
- if (!pcd_req)
- return;
- }
+ if (!pcd_req && !set_next_request())
+ return;
if (rq_data_dir(pcd_req) == READ) {
struct pcd_unit *cd = pcd_req->rq_disk->private_data;
@@ -766,6 +789,11 @@ static void do_pcd_request(struct request_queue * q)
}
}
+static void do_pcd_request(struct request_queue *q)
+{
+ pcd_request();
+}
+
static inline void next_request(int err)
{
unsigned long saved_flags;
@@ -774,7 +802,7 @@ static inline void next_request(int err)
if (!__blk_end_request_cur(pcd_req, err))
pcd_req = NULL;
pcd_busy = 0;
- do_pcd_request(pcd_queue);
+ pcd_request();
spin_unlock_irqrestore(&pcd_lock, saved_flags);
}
@@ -849,7 +877,7 @@ static void do_pcd_read_drq(void)
do_pcd_read();
spin_lock_irqsave(&pcd_lock, saved_flags);
- do_pcd_request(pcd_queue);
+ pcd_request();
spin_unlock_irqrestore(&pcd_lock, saved_flags);
}
@@ -957,19 +985,10 @@ static int __init pcd_init(void)
return -EBUSY;
}
- pcd_queue = blk_init_queue(do_pcd_request, &pcd_lock);
- if (!pcd_queue) {
- unregister_blkdev(major, name);
- for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
- put_disk(cd->disk);
- return -ENOMEM;
- }
-
for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
if (cd->present) {
register_cdrom(&cd->info);
cd->disk->private_data = cd;
- cd->disk->queue = pcd_queue;
add_disk(cd->disk);
}
}
@@ -988,9 +1007,9 @@ static void __exit pcd_exit(void)
pi_release(cd->pi);
unregister_cdrom(&cd->info);
}
+ blk_cleanup_queue(cd->disk->queue);
put_disk(cd->disk);
}
- blk_cleanup_queue(pcd_queue);
unregister_blkdev(major, name);
pi_unregister_driver(par_drv);
}
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 9cfd2e06a6491..7d2402f909789 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -381,12 +381,33 @@ static enum action do_pd_write_start(void);
static enum action do_pd_read_drq(void);
static enum action do_pd_write_done(void);
-static struct request_queue *pd_queue;
+static int pd_queue;
static int pd_claimed;
static struct pd_unit *pd_current; /* current request's drive */
static PIA *pi_current; /* current request's PIA */
+static int set_next_request(void)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ int old_pos = pd_queue;
+
+ do {
+ disk = pd[pd_queue].gd;
+ q = disk ? disk->queue : NULL;
+ if (++pd_queue == PD_UNITS)
+ pd_queue = 0;
+ if (q) {
+ pd_req = blk_fetch_request(q);
+ if (pd_req)
+ break;
+ }
+ } while (pd_queue != old_pos);
+
+ return pd_req != NULL;
+}
+
static void run_fsm(void)
{
while (1) {
@@ -418,8 +439,7 @@ static void run_fsm(void)
spin_lock_irqsave(&pd_lock, saved_flags);
if (!__blk_end_request_cur(pd_req,
res == Ok ? 0 : -EIO)) {
- pd_req = blk_fetch_request(pd_queue);
- if (!pd_req)
+ if (!set_next_request())
stop = 1;
}
spin_unlock_irqrestore(&pd_lock, saved_flags);
@@ -719,18 +739,15 @@ static int pd_special_command(struct pd_unit *disk,
enum action (*func)(struct pd_unit *disk))
{
struct request *rq;
- int err = 0;
rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
if (IS_ERR(rq))
return PTR_ERR(rq);
rq->special = func;
-
- err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
-
+ blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
blk_put_request(rq);
- return err;
+ return 0;
}
/* kernel glue structures */
@@ -839,7 +856,13 @@ static void pd_probe_drive(struct pd_unit *disk)
p->first_minor = (disk - pd) << PD_BITS;
disk->gd = p;
p->private_data = disk;
- p->queue = pd_queue;
+ p->queue = blk_init_queue(do_pd_request, &pd_lock);
+ if (!p->queue) {
+ disk->gd = NULL;
+ put_disk(p);
+ return;
+ }
+ blk_queue_max_hw_sectors(p->queue, cluster);
if (disk->drive == -1) {
for (disk->drive = 0; disk->drive <= 1; disk->drive++)
@@ -919,26 +942,18 @@ static int __init pd_init(void)
if (disable)
goto out1;
- pd_queue = blk_init_queue(do_pd_request, &pd_lock);
- if (!pd_queue)
- goto out1;
-
- blk_queue_max_hw_sectors(pd_queue, cluster);
-
if (register_blkdev(major, name))
- goto out2;
+ goto out1;
printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
name, name, PD_VERSION, major, cluster, nice);
if (!pd_detect())
- goto out3;
+ goto out2;
return 0;
-out3:
- unregister_blkdev(major, name);
out2:
- blk_cleanup_queue(pd_queue);
+ unregister_blkdev(major, name);
out1:
return -ENODEV;
}
@@ -953,11 +968,11 @@ static void __exit pd_exit(void)
if (p) {
disk->gd = NULL;
del_gendisk(p);
+ blk_cleanup_queue(p->queue);
put_disk(p);
pi_release(disk->pi);
}
}
- blk_cleanup_queue(pd_queue);
}
MODULE_LICENSE("GPL");
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 14c5d32f5d8bc..f24ca7315ddc9 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -287,6 +287,12 @@ static void __init pf_init_units(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
continue;
+ disk->queue = blk_init_queue(do_pf_request, &pf_spin_lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ return;
+ }
+ blk_queue_max_segments(disk->queue, cluster);
pf->disk = disk;
pf->pi = &pf->pia;
pf->media_status = PF_NM;
@@ -772,7 +778,28 @@ static int pf_ready(void)
return (((status_reg(pf_current) & (STAT_BUSY | pf_mask)) == pf_mask));
}
-static struct request_queue *pf_queue;
+static int pf_queue;
+
+static int set_next_request(void)
+{
+ struct pf_unit *pf;
+ struct request_queue *q;
+ int old_pos = pf_queue;
+
+ do {
+ pf = &units[pf_queue];
+ q = pf->present ? pf->disk->queue : NULL;
+ if (++pf_queue == PF_UNITS)
+ pf_queue = 0;
+ if (q) {
+ pf_req = blk_fetch_request(q);
+ if (pf_req)
+ break;
+ }
+ } while (pf_queue != old_pos);
+
+ return pf_req != NULL;
+}
static void pf_end_request(int err)
{
@@ -780,16 +807,13 @@ static void pf_end_request(int err)
pf_req = NULL;
}
-static void do_pf_request(struct request_queue * q)
+static void pf_request(void)
{
if (pf_busy)
return;
repeat:
- if (!pf_req) {
- pf_req = blk_fetch_request(q);
- if (!pf_req)
- return;
- }
+ if (!pf_req && !set_next_request())
+ return;
pf_current = pf_req->rq_disk->private_data;
pf_block = blk_rq_pos(pf_req);
@@ -817,6 +841,11 @@ repeat:
}
}
+static void do_pf_request(struct request_queue *q)
+{
+ pf_request();
+}
+
static int pf_next_buf(void)
{
unsigned long saved_flags;
@@ -846,7 +875,7 @@ static inline void next_request(int err)
spin_lock_irqsave(&pf_spin_lock, saved_flags);
pf_end_request(err);
pf_busy = 0;
- do_pf_request(pf_queue);
+ pf_request();
spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
}
@@ -972,15 +1001,6 @@ static int __init pf_init(void)
put_disk(pf->disk);
return -EBUSY;
}
- pf_queue = blk_init_queue(do_pf_request, &pf_spin_lock);
- if (!pf_queue) {
- unregister_blkdev(major, name);
- for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
- put_disk(pf->disk);
- return -ENOMEM;
- }
-
- blk_queue_max_segments(pf_queue, cluster);
for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
struct gendisk *disk = pf->disk;
@@ -988,7 +1008,6 @@ static int __init pf_init(void)
if (!pf->present)
continue;
disk->private_data = pf;
- disk->queue = pf_queue;
add_disk(disk);
}
return 0;
@@ -1003,10 +1022,10 @@ static void __exit pf_exit(void)
if (!pf->present)
continue;
del_gendisk(pf->disk);
+ blk_cleanup_queue(pf->disk->queue);
put_disk(pf->disk);
pi_release(pf->pi);
}
- blk_cleanup_queue(pf_queue);
}
MODULE_LICENSE("GPL");
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 66d846ba85a97..205b865ebeb9f 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -724,7 +724,7 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
rq->rq_flags |= RQF_QUIET;
blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
- if (rq->errors)
+ if (scsi_req(rq)->result)
ret = -EIO;
out:
blk_put_request(rq);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 517838b659646..089ac4179919d 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4317,7 +4317,7 @@ static int rbd_init_request(void *data, struct request *rq,
return 0;
}
-static struct blk_mq_ops rbd_mq_ops = {
+static const struct blk_mq_ops rbd_mq_ops = {
.queue_rq = rbd_queue_rq,
.init_request = rbd_init_request,
};
@@ -4380,7 +4380,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
q->limits.discard_granularity = segment_size;
q->limits.discard_alignment = segment_size;
blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
- q->limits.discard_zeroes_data = 1;
if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index f81d70b39d109..9c566364ac9c3 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -300,7 +300,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
RSXX_HW_BLK_SIZE >> 9);
card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;
card->queue->limits.discard_alignment = RSXX_HW_BLK_SIZE;
- card->queue->limits.discard_zeroes_data = 1;
}
card->queue->queuedata = card;
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index b5afd495d482e..3064be6cf3755 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -211,7 +211,7 @@ enum head {
struct swim_priv {
struct swim __iomem *base;
spinlock_t lock;
- struct request_queue *queue;
+ int fdc_queue;
int floppy_count;
struct floppy_state unit[FD_MAX_UNIT];
};
@@ -525,12 +525,33 @@ static int floppy_read_sectors(struct floppy_state *fs,
return 0;
}
-static void redo_fd_request(struct request_queue *q)
+static struct request *swim_next_request(struct swim_priv *swd)
{
+ struct request_queue *q;
+ struct request *rq;
+ int old_pos = swd->fdc_queue;
+
+ do {
+ q = swd->unit[swd->fdc_queue].disk->queue;
+ if (++swd->fdc_queue == swd->floppy_count)
+ swd->fdc_queue = 0;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ return rq;
+ }
+ } while (swd->fdc_queue != old_pos);
+
+ return NULL;
+}
+
+static void do_fd_request(struct request_queue *q)
+{
+ struct swim_priv *swd = q->queuedata;
struct request *req;
struct floppy_state *fs;
- req = blk_fetch_request(q);
+ req = swim_next_request(swd);
while (req) {
int err = -EIO;
@@ -554,15 +575,10 @@ static void redo_fd_request(struct request_queue *q)
}
done:
if (!__blk_end_request_cur(req, err))
- req = blk_fetch_request(q);
+ req = swim_next_request(swd);
}
}
-static void do_fd_request(struct request_queue *q)
-{
- redo_fd_request(q);
-}
-
static struct floppy_struct floppy_type[4] = {
{ 0, 0, 0, 0, 0, 0x00, 0x00, 0x00, 0x00, NULL }, /* no testing */
{ 720, 9, 1, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 360KB SS 3.5"*/
@@ -833,22 +849,25 @@ static int swim_floppy_init(struct swim_priv *swd)
return -EBUSY;
}
+ spin_lock_init(&swd->lock);
+
for (drive = 0; drive < swd->floppy_count; drive++) {
swd->unit[drive].disk = alloc_disk(1);
if (swd->unit[drive].disk == NULL) {
err = -ENOMEM;
goto exit_put_disks;
}
+ swd->unit[drive].disk->queue = blk_init_queue(do_fd_request,
+ &swd->lock);
+ if (!swd->unit[drive].disk->queue) {
+ err = -ENOMEM;
+ put_disk(swd->unit[drive].disk);
+ goto exit_put_disks;
+ }
+ swd->unit[drive].disk->queue->queuedata = swd;
swd->unit[drive].swd = swd;
}
- spin_lock_init(&swd->lock);
- swd->queue = blk_init_queue(do_fd_request, &swd->lock);
- if (!swd->queue) {
- err = -ENOMEM;
- goto exit_put_disks;
- }
-
for (drive = 0; drive < swd->floppy_count; drive++) {
swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE;
swd->unit[drive].disk->major = FLOPPY_MAJOR;
@@ -856,7 +875,6 @@ static int swim_floppy_init(struct swim_priv *swd)
sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
swd->unit[drive].disk->fops = &floppy_fops;
swd->unit[drive].disk->private_data = &swd->unit[drive];
- swd->unit[drive].disk->queue = swd->queue;
set_capacity(swd->unit[drive].disk, 2880);
add_disk(swd->unit[drive].disk);
}
@@ -943,13 +961,12 @@ static int swim_remove(struct platform_device *dev)
for (drive = 0; drive < swd->floppy_count; drive++) {
del_gendisk(swd->unit[drive].disk);
+ blk_cleanup_queue(swd->unit[drive].disk->queue);
put_disk(swd->unit[drive].disk);
}
unregister_blkdev(FLOPPY_MAJOR, "fd");
- blk_cleanup_queue(swd->queue);
-
/* eject floppies */
for (drive = 0; drive < swd->floppy_count; drive++)
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 61b3ffa4f4589..ba4809c9bdbad 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -343,8 +343,8 @@ static void start_request(struct floppy_state *fs)
req->rq_disk->disk_name, req->cmd,
(long)blk_rq_pos(req), blk_rq_sectors(req),
bio_data(req->bio));
- swim3_dbg(" errors=%d current_nr_sectors=%u\n",
- req->errors, blk_rq_cur_sectors(req));
+ swim3_dbg(" current_nr_sectors=%u\n",
+ blk_rq_cur_sectors(req));
#endif
if (blk_rq_pos(req) >= fs->total_secs) {
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1d4c9f8bc1e16..f94614257462c 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -111,7 +111,7 @@ static int virtblk_add_req_scsi(struct virtqueue *vq, struct virtblk_req *vbr,
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}
-static inline void virtblk_scsi_reques_done(struct request *req)
+static inline void virtblk_scsi_request_done(struct request *req)
{
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
struct virtio_blk *vblk = req->q->queuedata;
@@ -119,7 +119,7 @@ static inline void virtblk_scsi_reques_done(struct request *req)
sreq->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
sreq->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
- req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
+ sreq->result = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
}
static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
@@ -144,7 +144,7 @@ static inline int virtblk_add_req_scsi(struct virtqueue *vq,
{
return -EIO;
}
-static inline void virtblk_scsi_reques_done(struct request *req)
+static inline void virtblk_scsi_request_done(struct request *req)
{
}
#define virtblk_ioctl NULL
@@ -175,19 +175,15 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
static inline void virtblk_request_done(struct request *req)
{
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
- int error = virtblk_result(vbr);
switch (req_op(req)) {
case REQ_OP_SCSI_IN:
case REQ_OP_SCSI_OUT:
- virtblk_scsi_reques_done(req);
- break;
- case REQ_OP_DRV_IN:
- req->errors = (error != 0);
+ virtblk_scsi_request_done(req);
break;
}
- blk_mq_end_request(req, error);
+ blk_mq_end_request(req, virtblk_result(vbr));
}
static void virtblk_done(struct virtqueue *vq)
@@ -205,7 +201,7 @@ static void virtblk_done(struct virtqueue *vq)
while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
struct request *req = blk_mq_rq_from_pdu(vbr);
- blk_mq_complete_request(req, req->errors);
+ blk_mq_complete_request(req);
req_done = true;
}
if (unlikely(virtqueue_is_broken(vq)))
@@ -310,7 +306,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
if (err)
goto out;
- err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+ blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+ err = virtblk_result(blk_mq_rq_to_pdu(req));
out:
blk_put_request(req);
return err;
@@ -597,7 +594,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
return blk_mq_virtio_map_queues(set, vblk->vdev, 0);
}
-static struct blk_mq_ops virtio_mq_ops = {
+static const struct blk_mq_ops virtio_mq_ops = {
.queue_rq = virtio_queue_rq,
.complete = virtblk_request_done,
.init_request = virtblk_init_request,
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 5067a0a952cb2..39459631667cc 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -115,6 +115,15 @@ struct split_bio {
atomic_t pending;
};
+struct blkif_req {
+ int error;
+};
+
+static inline struct blkif_req *blkif_req(struct request *rq)
+{
+ return blk_mq_rq_to_pdu(rq);
+}
+
static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;
@@ -907,8 +916,14 @@ out_busy:
return BLK_MQ_RQ_QUEUE_BUSY;
}
-static struct blk_mq_ops blkfront_mq_ops = {
+static void blkif_complete_rq(struct request *rq)
+{
+ blk_mq_end_request(rq, blkif_req(rq)->error);
+}
+
+static const struct blk_mq_ops blkfront_mq_ops = {
.queue_rq = blkif_queue_rq,
+ .complete = blkif_complete_rq,
};
static void blkif_set_queue_limits(struct blkfront_info *info)
@@ -969,7 +984,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
- info->tag_set.cmd_size = 0;
+ info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info;
if (blk_mq_alloc_tag_set(&info->tag_set))
@@ -1543,7 +1558,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
unsigned long flags;
struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
struct blkfront_info *info = rinfo->dev_info;
- int error;
if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
return IRQ_HANDLED;
@@ -1587,37 +1601,36 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
continue;
}
- error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+ blkif_req(req)->error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
switch (bret->operation) {
case BLKIF_OP_DISCARD:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
struct request_queue *rq = info->rq;
printk(KERN_WARNING "blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ blkif_req(req)->error = -EOPNOTSUPP;
info->feature_discard = 0;
info->feature_secdiscard = 0;
queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
}
- blk_mq_complete_request(req, error);
break;
case BLKIF_OP_FLUSH_DISKCACHE:
case BLKIF_OP_WRITE_BARRIER:
if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
printk(KERN_WARNING "blkfront: %s: %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ blkif_req(req)->error = -EOPNOTSUPP;
}
if (unlikely(bret->status == BLKIF_RSP_ERROR &&
rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
info->gd->disk_name, op_name(bret->operation));
- error = -EOPNOTSUPP;
+ blkif_req(req)->error = -EOPNOTSUPP;
}
- if (unlikely(error)) {
- if (error == -EOPNOTSUPP)
- error = 0;
+ if (unlikely(blkif_req(req)->error)) {
+ if (blkif_req(req)->error == -EOPNOTSUPP)
+ blkif_req(req)->error = 0;
info->feature_fua = 0;
info->feature_flush = 0;
xlvbd_flush(info);
@@ -1629,11 +1642,12 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
"request: %x\n", bret->status);
- blk_mq_complete_request(req, error);
break;
default:
BUG();
}
+
+ blk_mq_complete_request(req);
}
rinfo->ring.rsp_cons = i;
@@ -2345,6 +2359,7 @@ static void blkfront_connect(struct blkfront_info *info)
unsigned long sector_size;
unsigned int physical_sector_size;
unsigned int binfo;
+ char *envp[] = { "RESIZE=1", NULL };
int err, i;
switch (info->connected) {
@@ -2361,6 +2376,8 @@ static void blkfront_connect(struct blkfront_info *info)
sectors);
set_capacity(info->gd, sectors);
revalidate_disk(info->gd);
+ kobject_uevent_env(&disk_to_dev(info->gd)->kobj,
+ KOBJ_CHANGE, envp);
return;
case BLKIF_STATE_SUSPENDED:
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0c09d42561081..6fac5fedd6107 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -829,10 +829,14 @@ static void __zram_make_request(struct zram *zram, struct bio *bio)
offset = (bio->bi_iter.bi_sector &
(SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
- if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
zram_bio_discard(zram, index, offset, bio);
bio_endio(bio);
return;
+ default:
+ break;
}
bio_for_each_segment(bvec, bio, iter) {
@@ -1192,6 +1196,8 @@ static int zram_add(void)
zram->disk->queue->limits.max_sectors = SECTORS_PER_PAGE;
zram->disk->queue->limits.chunk_sectors = 0;
blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+
/*
* zram_bio_discard() will clear all logical blocks if logical block
* size is identical with physical block size(PAGE_SIZE). But if it is
@@ -1201,10 +1207,7 @@ static int zram_add(void)
* zeroed.
*/
if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
- zram->disk->queue->limits.discard_zeroes_data = 1;
- else
- zram->disk->queue->limits.discard_zeroes_data = 0;
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+ blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
add_disk(zram->disk);
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 87739649eac21..76c952fd9ab90 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2218,7 +2218,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
rq->timeout = 60 * HZ;
bio = rq->bio;
- if (blk_execute_rq(q, cdi->disk, rq, 0)) {
+ blk_execute_rq(q, cdi->disk, rq, 0);
+ if (scsi_req(rq)->result) {
struct request_sense *s = req->sense;
ret = -EIO;
cdi->last_sense = s->sense_key;
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index feb30061123bc..5901937284e70 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -107,7 +107,8 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
memcpy(scsi_req(rq)->cmd, pc->c, 12);
if (drive->media == ide_tape)
scsi_req(rq)->cmd[13] = REQ_IDETAPE_PC1;
- error = blk_execute_rq(drive->queue, disk, rq, 0);
+ blk_execute_rq(drive->queue, disk, rq, 0);
+ error = scsi_req(rq)->result ? -EIO : 0;
put_req:
blk_put_request(rq);
return error;
@@ -454,7 +455,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
debug_log("%s: I/O error\n", drive->name);
if (drive->media != ide_tape)
- pc->rq->errors++;
+ scsi_req(pc->rq)->result++;
if (scsi_req(rq)->cmd[0] == REQUEST_SENSE) {
printk(KERN_ERR PFX "%s: I/O error in request "
@@ -488,13 +489,13 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
drive->failed_pc = NULL;
if (ata_misc_request(rq)) {
- rq->errors = 0;
+ scsi_req(rq)->result = 0;
error = 0;
} else {
if (blk_rq_is_passthrough(rq) && uptodate <= 0) {
- if (rq->errors == 0)
- rq->errors = -EIO;
+ if (scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
}
error = uptodate ? 0 : -EIO;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 74f1b7dc03f73..07e5ff3a64c33 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -247,10 +247,10 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
struct cdrom_info *info = drive->driver_data;
- if (!rq->errors)
+ if (!scsi_req(rq)->result)
info->write_timeout = jiffies + ATAPI_WAIT_WRITE_BUSY;
- rq->errors = 1;
+ scsi_req(rq)->result = 1;
if (time_after(jiffies, info->write_timeout))
return 0;
@@ -294,8 +294,8 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
}
/* if we have an error, pass CHECK_CONDITION as the SCSI status byte */
- if (blk_rq_is_scsi(rq) && !rq->errors)
- rq->errors = SAM_STAT_CHECK_CONDITION;
+ if (blk_rq_is_scsi(rq) && !scsi_req(rq)->result)
+ scsi_req(rq)->result = SAM_STAT_CHECK_CONDITION;
if (blk_noretry_request(rq))
do_end_request = 1;
@@ -325,7 +325,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
* Arrange to retry the request but be sure to give up if we've
* retried too many times.
*/
- if (++rq->errors > ERROR_MAX)
+ if (++scsi_req(rq)->result > ERROR_MAX)
do_end_request = 1;
break;
case ILLEGAL_REQUEST:
@@ -372,7 +372,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
/* go to the default handler for other errors */
ide_error(drive, "cdrom_decode_status", stat);
return 1;
- } else if (++rq->errors > ERROR_MAX)
+ } else if (++scsi_req(rq)->result > ERROR_MAX)
/* we've racked up too many retries, abort */
do_end_request = 1;
}
@@ -452,7 +452,8 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
}
}
- error = blk_execute_rq(drive->queue, info->disk, rq, 0);
+ blk_execute_rq(drive->queue, info->disk, rq, 0);
+ error = scsi_req(rq)->result ? -EIO : 0;
if (buffer)
*bufflen = scsi_req(rq)->resid_len;
@@ -683,8 +684,8 @@ out_end:
if (cmd->nleft == 0)
uptodate = 1;
} else {
- if (uptodate <= 0 && rq->errors == 0)
- rq->errors = -EIO;
+ if (uptodate <= 0 && scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
}
if (uptodate == 0 && rq->bio)
@@ -1379,7 +1380,7 @@ static int ide_cdrom_prep_pc(struct request *rq)
* appropriate action
*/
if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
- rq->errors = ILLEGAL_REQUEST;
+ scsi_req(rq)->result = ILLEGAL_REQUEST;
return BLKPREP_KILL;
}
diff --git a/drivers/ide/ide-cd_ioctl.c b/drivers/ide/ide-cd_ioctl.c
index 9fcefbc8425e7..55cd736c39c67 100644
--- a/drivers/ide/ide-cd_ioctl.c
+++ b/drivers/ide/ide-cd_ioctl.c
@@ -307,7 +307,8 @@ int ide_cdrom_reset(struct cdrom_device_info *cdi)
scsi_req_init(rq);
ide_req(rq)->type = ATA_PRIV_MISC;
rq->rq_flags = RQF_QUIET;
- ret = blk_execute_rq(drive->queue, cd->disk, rq, 0);
+ blk_execute_rq(drive->queue, cd->disk, rq, 0);
+ ret = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
/*
* A reset will unlock the door. If it was previously locked,
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index a45dda5386e44..9b69c32ee5601 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -173,8 +173,8 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
*(int *)&scsi_req(rq)->cmd[1] = arg;
rq->special = setting->set;
- if (blk_execute_rq(q, NULL, rq, 0))
- ret = rq->errors;
+ blk_execute_rq(q, NULL, rq, 0);
+ ret = scsi_req(rq)->result;
blk_put_request(rq);
return ret;
@@ -186,7 +186,7 @@ ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
if (err)
- rq->errors = err;
- ide_complete_rq(drive, err, blk_rq_bytes(rq));
+ scsi_req(rq)->result = err;
+ ide_complete_rq(drive, 0, blk_rq_bytes(rq));
return ide_stopped;
}
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 186159715b71c..7c06237f34795 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -470,7 +470,6 @@ ide_devset_get(multcount, mult_count);
static int set_multcount(ide_drive_t *drive, int arg)
{
struct request *rq;
- int error;
if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff))
return -EINVAL;
@@ -484,7 +483,7 @@ static int set_multcount(ide_drive_t *drive, int arg)
drive->mult_req = arg;
drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
- error = blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(drive->queue, NULL, rq, 0);
blk_put_request(rq);
return (drive->mult_count == arg) ? 0 : -EIO;
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index 17a65ac564918..51c81223e56d0 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -490,7 +490,7 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
* make sure request is sane
*/
if (hwif->rq)
- hwif->rq->errors = 0;
+ scsi_req(hwif->rq)->result = 0;
return ret;
}
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index cf3af68403689..4b7ffd7d158dc 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -12,7 +12,7 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
if ((stat & ATA_BUSY) ||
((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
/* other bits are useless when BUSY */
- rq->errors |= ERROR_RESET;
+ scsi_req(rq)->result |= ERROR_RESET;
} else if (stat & ATA_ERR) {
/* err has different meaning on cdrom and tape */
if (err == ATA_ABORTED) {
@@ -25,10 +25,10 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
drive->crc_count++;
} else if (err & (ATA_BBK | ATA_UNC)) {
/* retries won't help these */
- rq->errors = ERROR_MAX;
+ scsi_req(rq)->result = ERROR_MAX;
} else if (err & ATA_TRK0NF) {
/* help it find track zero */
- rq->errors |= ERROR_RECAL;
+ scsi_req(rq)->result |= ERROR_RECAL;
}
}
@@ -39,23 +39,23 @@ static ide_startstop_t ide_ata_error(ide_drive_t *drive, struct request *rq,
ide_pad_transfer(drive, READ, nsect * SECTOR_SIZE);
}
- if (rq->errors >= ERROR_MAX || blk_noretry_request(rq)) {
+ if (scsi_req(rq)->result >= ERROR_MAX || blk_noretry_request(rq)) {
ide_kill_rq(drive, rq);
return ide_stopped;
}
if (hwif->tp_ops->read_status(hwif) & (ATA_BUSY | ATA_DRQ))
- rq->errors |= ERROR_RESET;
+ scsi_req(rq)->result |= ERROR_RESET;
- if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
- ++rq->errors;
+ if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
+ ++scsi_req(rq)->result;
return ide_do_reset(drive);
}
- if ((rq->errors & ERROR_RECAL) == ERROR_RECAL)
+ if ((scsi_req(rq)->result & ERROR_RECAL) == ERROR_RECAL)
drive->special_flags |= IDE_SFLAG_RECALIBRATE;
- ++rq->errors;
+ ++scsi_req(rq)->result;
return ide_stopped;
}
@@ -68,7 +68,7 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
if ((stat & ATA_BUSY) ||
((stat & ATA_DF) && (drive->dev_flags & IDE_DFLAG_NOWERR) == 0)) {
/* other bits are useless when BUSY */
- rq->errors |= ERROR_RESET;
+ scsi_req(rq)->result |= ERROR_RESET;
} else {
/* add decoding error stuff */
}
@@ -77,14 +77,14 @@ static ide_startstop_t ide_atapi_error(ide_drive_t *drive, struct request *rq,
/* force an abort */
hwif->tp_ops->exec_command(hwif, ATA_CMD_IDLEIMMEDIATE);
- if (rq->errors >= ERROR_MAX) {
+ if (scsi_req(rq)->result >= ERROR_MAX) {
ide_kill_rq(drive, rq);
} else {
- if ((rq->errors & ERROR_RESET) == ERROR_RESET) {
- ++rq->errors;
+ if ((scsi_req(rq)->result & ERROR_RESET) == ERROR_RESET) {
+ ++scsi_req(rq)->result;
return ide_do_reset(drive);
}
- ++rq->errors;
+ ++scsi_req(rq)->result;
}
return ide_stopped;
@@ -130,11 +130,11 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
if (cmd)
ide_complete_cmd(drive, cmd, stat, err);
} else if (ata_pm_request(rq)) {
- rq->errors = 1;
+ scsi_req(rq)->result = 1;
ide_complete_pm_rq(drive, rq);
return ide_stopped;
}
- rq->errors = err;
+ scsi_req(rq)->result = err;
ide_complete_rq(drive, err ? -EIO : 0, blk_rq_bytes(rq));
return ide_stopped;
}
@@ -149,8 +149,8 @@ static inline void ide_complete_drive_reset(ide_drive_t *drive, int err)
if (rq && ata_misc_request(rq) &&
scsi_req(rq)->cmd[0] == REQ_DRIVE_RESET) {
- if (err <= 0 && rq->errors == 0)
- rq->errors = -EIO;
+ if (err <= 0 && scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
ide_complete_rq(drive, err ? err : 0, blk_rq_bytes(rq));
}
}
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index a69e8013f1dff..8ac6048cd2df9 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -98,7 +98,7 @@ static int ide_floppy_callback(ide_drive_t *drive, int dsc)
}
if (ata_misc_request(rq))
- rq->errors = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
+ scsi_req(rq)->result = uptodate ? 0 : IDE_DRV_ERROR_GENERAL;
return uptodate;
}
@@ -239,7 +239,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
? rq->rq_disk->disk_name
: "dev?"));
- if (rq->errors >= ERROR_MAX) {
+ if (scsi_req(rq)->result >= ERROR_MAX) {
if (drive->failed_pc) {
ide_floppy_report_error(floppy, drive->failed_pc);
drive->failed_pc = NULL;
@@ -247,7 +247,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
printk(KERN_ERR PFX "%s: I/O error\n", drive->name);
if (ata_misc_request(rq)) {
- rq->errors = 0;
+ scsi_req(rq)->result = 0;
ide_complete_rq(drive, 0, blk_rq_bytes(rq));
return ide_stopped;
} else
@@ -301,8 +301,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
return ide_floppy_issue_pc(drive, &cmd, pc);
out_end:
drive->failed_pc = NULL;
- if (blk_rq_is_passthrough(rq) && rq->errors == 0)
- rq->errors = -EIO;
+ if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
return ide_stopped;
}
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 043b1fb963cb8..45b3f41a43d41 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -141,12 +141,12 @@ void ide_kill_rq(ide_drive_t *drive, struct request *rq)
drive->failed_pc = NULL;
if ((media == ide_floppy || media == ide_tape) && drv_req) {
- rq->errors = 0;
+ scsi_req(rq)->result = 0;
} else {
if (media == ide_tape)
- rq->errors = IDE_DRV_ERROR_GENERAL;
- else if (blk_rq_is_passthrough(rq) && rq->errors == 0)
- rq->errors = -EIO;
+ scsi_req(rq)->result = IDE_DRV_ERROR_GENERAL;
+ else if (blk_rq_is_passthrough(rq) && scsi_req(rq)->result == 0)
+ scsi_req(rq)->result = -EIO;
}
ide_complete_rq(drive, -EIO, blk_rq_bytes(rq));
@@ -271,7 +271,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
#ifdef DEBUG
printk("%s: DRIVE_CMD (null)\n", drive->name);
#endif
- rq->errors = 0;
+ scsi_req(rq)->result = 0;
ide_complete_rq(drive, 0, blk_rq_bytes(rq));
return ide_stopped;
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index 248a3e0ceb468..8c0d17297a7a0 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -128,7 +128,8 @@ static int ide_cmd_ioctl(ide_drive_t *drive, unsigned long arg)
rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM);
scsi_req_init(rq);
ide_req(rq)->type = ATA_PRIV_TASKFILE;
- err = blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(drive->queue, NULL, rq, 0);
+ err = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
return err;
@@ -227,8 +228,8 @@ static int generic_drive_reset(ide_drive_t *drive)
ide_req(rq)->type = ATA_PRIV_MISC;
scsi_req(rq)->cmd_len = 1;
scsi_req(rq)->cmd[0] = REQ_DRIVE_RESET;
- if (blk_execute_rq(drive->queue, NULL, rq, 1))
- ret = rq->errors;
+ blk_execute_rq(drive->queue, NULL, rq, 1);
+ ret = scsi_req(rq)->result;
blk_put_request(rq);
return ret;
}
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 101aed9a61ca3..94e3107f59b93 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -37,7 +37,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
scsi_req(rq)->cmd_len = 1;
ide_req(rq)->type = ATA_PRIV_MISC;
rq->special = &timeout;
- rc = blk_execute_rq(q, NULL, rq, 1);
+ blk_execute_rq(q, NULL, rq, 1);
+ rc = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
if (rc)
goto out;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index ec951be4b0c8a..0977fc1f40ce4 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -27,7 +27,8 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
mesg.event = PM_EVENT_FREEZE;
rqpm.pm_state = mesg.event;
- ret = blk_execute_rq(drive->queue, NULL, rq, 0);
+ blk_execute_rq(drive->queue, NULL, rq, 0);
+ ret = scsi_req(rq)->result ? -EIO : 0;
blk_put_request(rq);
if (ret == 0 && ide_port_acpi(hwif)) {
@@ -55,8 +56,8 @@ static int ide_pm_execute_rq(struct request *rq)
spin_lock_irq(q->queue_lock);
if (unlikely(blk_queue_dying(q))) {
rq->rq_flags |= RQF_QUIET;
- rq->errors = -ENXIO;
- __blk_end_request_all(rq, rq->errors);
+ scsi_req(rq)->result = -ENXIO;
+ __blk_end_request_all(rq, 0);
spin_unlock_irq(q->queue_lock);
return -ENXIO;
}
@@ -66,7 +67,7 @@ static int ide_pm_execute_rq(struct request *rq)
wait_for_completion_io(&wait);
- return rq->errors ? -EIO : 0;
+ return scsi_req(rq)->result ? -EIO : 0;
}
int generic_ide_resume(struct device *dev)
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d8a552b47718e..a0651f948b76e 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -366,7 +366,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
err = pc->error;
}
}
- rq->errors = err;
+ scsi_req(rq)->result = err;
return uptodate;
}
@@ -879,7 +879,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
tape->valid = 0;
ret = size;
- if (rq->errors == IDE_DRV_ERROR_GENERAL)
+ if (scsi_req(rq)->result == IDE_DRV_ERROR_GENERAL)
ret = -EIO;
out_put:
blk_put_request(rq);
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 4c0007cb74e37..d71199d23c9ec 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -287,7 +287,7 @@ static void ide_pio_datablock(ide_drive_t *drive, struct ide_cmd *cmd,
u8 saved_io_32bit = drive->io_32bit;
if (cmd->tf_flags & IDE_TFLAG_FS)
- cmd->rq->errors = 0;
+ scsi_req(cmd->rq)->result = 0;
if (cmd->tf_flags & IDE_TFLAG_IO_16BIT)
drive->io_32bit = 0;
@@ -329,7 +329,7 @@ void ide_finish_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat)
u8 set_xfer = !!(cmd->tf_flags & IDE_TFLAG_SET_XFER);
ide_complete_cmd(drive, cmd, stat, err);
- rq->errors = err;
+ scsi_req(rq)->result = err;
if (err == 0 && set_xfer) {
ide_set_xfer_rate(drive, nsect);
@@ -452,8 +452,8 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
rq->special = cmd;
cmd->rq = rq;
- error = blk_execute_rq(drive->queue, NULL, rq, 0);
-
+ blk_execute_rq(drive->queue, NULL, rq, 0);
+ error = scsi_req(rq)->result ? -EIO : 0;
put_req:
blk_put_request(rq);
return error;
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 052714106b7b8..ead61a93cb4eb 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -33,4 +33,13 @@ config NVM_RRPC
host. The target is implemented using a linear mapping table and
cost-based garbage collection. It is optimized for 4K IO sizes.
+config NVM_PBLK
+ tristate "Physical Block Device Open-Channel SSD target"
+ ---help---
+ Allows an open-channel SSD to be exposed as a block device to the
+ host. The target assumes the device exposes raw flash and must be
+ explicitly managed by the host.
+
+ Please note the disk format is considered EXPERIMENTAL for now.
+
endif # NVM
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index b2a39e2d28952..82d1a117fb275 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -4,3 +4,8 @@
obj-$(CONFIG_NVM) := core.o
obj-$(CONFIG_NVM_RRPC) += rrpc.o
+obj-$(CONFIG_NVM_PBLK) += pblk.o
+pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
+ pblk-write.o pblk-cache.o pblk-read.o \
+ pblk-gc.o pblk-recovery.o pblk-map.o \
+ pblk-rl.o pblk-sysfs.o
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 5262ba66a7a74..54a06c3a2b8c7 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -89,7 +89,7 @@ static void nvm_release_luns_err(struct nvm_dev *dev, int lun_begin,
WARN_ON(!test_and_clear_bit(i, dev->lun_map));
}
-static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
+static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
{
struct nvm_dev *dev = tgt_dev->parent;
struct nvm_dev_map *dev_map = tgt_dev->map;
@@ -100,11 +100,14 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev)
int *lun_offs = ch_map->lun_offs;
int ch = i + ch_map->ch_off;
- for (j = 0; j < ch_map->nr_luns; j++) {
- int lun = j + lun_offs[j];
- int lunid = (ch * dev->geo.luns_per_chnl) + lun;
+ if (clear) {
+ for (j = 0; j < ch_map->nr_luns; j++) {
+ int lun = j + lun_offs[j];
+ int lunid = (ch * dev->geo.luns_per_chnl) + lun;
- WARN_ON(!test_and_clear_bit(lunid, dev->lun_map));
+ WARN_ON(!test_and_clear_bit(lunid,
+ dev->lun_map));
+ }
}
kfree(ch_map->lun_offs);
@@ -232,6 +235,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
struct nvm_target *t;
struct nvm_tgt_dev *tgt_dev;
void *targetdata;
+ int ret;
tt = nvm_find_target_type(create->tgttype, 1);
if (!tt) {
@@ -252,34 +256,43 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
return -ENOMEM;
t = kmalloc(sizeof(struct nvm_target), GFP_KERNEL);
- if (!t)
+ if (!t) {
+ ret = -ENOMEM;
goto err_reserve;
+ }
tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end);
if (!tgt_dev) {
pr_err("nvm: could not create target device\n");
+ ret = -ENOMEM;
goto err_t;
}
- tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
- if (!tqueue)
+ tdisk = alloc_disk(0);
+ if (!tdisk) {
+ ret = -ENOMEM;
goto err_dev;
- blk_queue_make_request(tqueue, tt->make_rq);
+ }
- tdisk = alloc_disk(0);
- if (!tdisk)
- goto err_queue;
+ tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
+ if (!tqueue) {
+ ret = -ENOMEM;
+ goto err_disk;
+ }
+ blk_queue_make_request(tqueue, tt->make_rq);
- sprintf(tdisk->disk_name, "%s", create->tgtname);
+ strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name));
tdisk->flags = GENHD_FL_EXT_DEVT;
tdisk->major = 0;
tdisk->first_minor = 0;
tdisk->fops = &nvm_fops;
tdisk->queue = tqueue;
- targetdata = tt->init(tgt_dev, tdisk);
- if (IS_ERR(targetdata))
+ targetdata = tt->init(tgt_dev, tdisk, create->flags);
+ if (IS_ERR(targetdata)) {
+ ret = PTR_ERR(targetdata);
goto err_init;
+ }
tdisk->private_data = targetdata;
tqueue->queuedata = targetdata;
@@ -289,8 +302,10 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
set_capacity(tdisk, tt->capacity(targetdata));
add_disk(tdisk);
- if (tt->sysfs_init && tt->sysfs_init(tdisk))
+ if (tt->sysfs_init && tt->sysfs_init(tdisk)) {
+ ret = -ENOMEM;
goto err_sysfs;
+ }
t->type = tt;
t->disk = tdisk;
@@ -305,16 +320,17 @@ err_sysfs:
if (tt->exit)
tt->exit(targetdata);
err_init:
- put_disk(tdisk);
-err_queue:
blk_cleanup_queue(tqueue);
+ tdisk->queue = NULL;
+err_disk:
+ put_disk(tdisk);
err_dev:
- nvm_remove_tgt_dev(tgt_dev);
+ nvm_remove_tgt_dev(tgt_dev, 0);
err_t:
kfree(t);
err_reserve:
nvm_release_luns_err(dev, s->lun_begin, s->lun_end);
- return -ENOMEM;
+ return ret;
}
static void __nvm_remove_target(struct nvm_target *t)
@@ -332,7 +348,7 @@ static void __nvm_remove_target(struct nvm_target *t)
if (tt->exit)
tt->exit(tdisk->private_data);
- nvm_remove_tgt_dev(t->dev);
+ nvm_remove_tgt_dev(t->dev, 1);
put_disk(tdisk);
list_del(&t->list);
@@ -411,6 +427,18 @@ err_rmap:
return -ENOMEM;
}
+static void nvm_unregister_map(struct nvm_dev *dev)
+{
+ struct nvm_dev_map *rmap = dev->rmap;
+ int i;
+
+ for (i = 0; i < dev->geo.nr_chnls; i++)
+ kfree(rmap->chnls[i].lun_offs);
+
+ kfree(rmap->chnls);
+ kfree(rmap);
+}
+
static void nvm_map_to_dev(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *p)
{
struct nvm_dev_map *dev_map = tgt_dev->map;
@@ -486,7 +514,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
int *lun_roffs;
struct ppa_addr gaddr;
u64 pba = le64_to_cpu(entries[i]);
- int off;
u64 diff;
if (!pba)
@@ -496,8 +523,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
lun_roffs = ch_rmap->lun_offs;
- off = gaddr.g.ch * geo->luns_per_chnl + gaddr.g.lun;
-
diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
(lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
@@ -590,11 +615,11 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
memset(&rqd, 0, sizeof(struct nvm_rq));
- nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas, 1);
+ nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
nvm_rq_tgt_to_dev(tgt_dev, &rqd);
ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type);
- nvm_free_rqd_ppalist(dev, &rqd);
+ nvm_free_rqd_ppalist(tgt_dev, &rqd);
if (ret) {
pr_err("nvm: failed bb mark\n");
return -EINVAL;
@@ -626,34 +651,45 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
}
EXPORT_SYMBOL(nvm_submit_io);
-int nvm_erase_blk(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int flags)
+static void nvm_end_io_sync(struct nvm_rq *rqd)
{
- struct nvm_dev *dev = tgt_dev->parent;
- struct nvm_rq rqd;
- int ret;
+ struct completion *waiting = rqd->private;
- if (!dev->ops->erase_block)
- return 0;
+ complete(waiting);
+}
- nvm_map_to_dev(tgt_dev, ppas);
+int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
+ int nr_ppas)
+{
+ struct nvm_geo *geo = &tgt_dev->geo;
+ struct nvm_rq rqd;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(wait);
memset(&rqd, 0, sizeof(struct nvm_rq));
- ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, 1, 1);
+ rqd.opcode = NVM_OP_ERASE;
+ rqd.end_io = nvm_end_io_sync;
+ rqd.private = &wait;
+ rqd.flags = geo->plane_mode >> 1;
+
+ ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1);
if (ret)
return ret;
- nvm_rq_tgt_to_dev(tgt_dev, &rqd);
-
- rqd.flags = flags;
-
- ret = dev->ops->erase_block(dev, &rqd);
+ ret = nvm_submit_io(tgt_dev, &rqd);
+ if (ret) {
+ pr_err("rrpr: erase I/O submission failed: %d\n", ret);
+ goto free_ppa_list;
+ }
+ wait_for_completion_io(&wait);
- nvm_free_rqd_ppalist(dev, &rqd);
+free_ppa_list:
+ nvm_free_rqd_ppalist(tgt_dev, &rqd);
return ret;
}
-EXPORT_SYMBOL(nvm_erase_blk);
+EXPORT_SYMBOL(nvm_erase_sync);
int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
nvm_l2p_update_fn *update_l2p, void *priv)
@@ -732,10 +768,11 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
}
EXPORT_SYMBOL(nvm_put_area);
-int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
+int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd,
const struct ppa_addr *ppas, int nr_ppas, int vblk)
{
- struct nvm_geo *geo = &dev->geo;
+ struct nvm_dev *dev = tgt_dev->parent;
+ struct nvm_geo *geo = &tgt_dev->geo;
int i, plane_cnt, pl_idx;
struct ppa_addr ppa;
@@ -773,12 +810,12 @@ int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd,
}
EXPORT_SYMBOL(nvm_set_rqd_ppalist);
-void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd)
+void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
{
if (!rqd->ppa_list)
return;
- nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list);
+ nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
}
EXPORT_SYMBOL(nvm_free_rqd_ppalist);
@@ -972,7 +1009,7 @@ err_fmtype:
return ret;
}
-void nvm_free(struct nvm_dev *dev)
+static void nvm_free(struct nvm_dev *dev)
{
if (!dev)
return;
@@ -980,7 +1017,7 @@ void nvm_free(struct nvm_dev *dev)
if (dev->dma_pool)
dev->ops->destroy_dma_pool(dev->dma_pool);
- kfree(dev->rmap);
+ nvm_unregister_map(dev);
kfree(dev->lptbl);
kfree(dev->lun_map);
kfree(dev);
@@ -1174,13 +1211,13 @@ static long nvm_ioctl_get_devices(struct file *file, void __user *arg)
list_for_each_entry(dev, &nvm_devices, devices) {
struct nvm_ioctl_device_info *info = &devices->info[i];
- sprintf(info->devname, "%s", dev->name);
+ strlcpy(info->devname, dev->name, sizeof(info->devname));
/* kept for compatibility */
info->bmversion[0] = 1;
info->bmversion[1] = 0;
info->bmversion[2] = 0;
- sprintf(info->bmname, "%s", "gennvm");
+ strlcpy(info->bmname, "gennvm", sizeof(info->bmname));
i++;
if (i > 31) {
@@ -1217,8 +1254,16 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
create.tgtname[DISK_NAME_LEN - 1] = '\0';
if (create.flags != 0) {
- pr_err("nvm: no flags supported\n");
- return -EINVAL;
+ __u32 flags = create.flags;
+
+ /* Check for valid flags */
+ if (flags & NVM_TARGET_FACTORY)
+ flags &= ~NVM_TARGET_FACTORY;
+
+ if (flags) {
+ pr_err("nvm: flag not supported\n");
+ return -EINVAL;
+ }
}
return __nvm_configure_create(&create);
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
new file mode 100644
index 0000000000000..59bcea88db842
--- /dev/null
+++ b/drivers/lightnvm/pblk-cache.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-cache.c - pblk's write cache
+ */
+
+#include "pblk.h"
+
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
+{
+ struct pblk_w_ctx w_ctx;
+ sector_t lba = pblk_get_lba(bio);
+ unsigned int bpos, pos;
+ int nr_entries = pblk_get_secs(bio);
+ int i, ret;
+
+ /* Update the write buffer head (mem) with the entries that we can
+ * write. The write in itself cannot fail, so there is no need to
+ * rollback from here on.
+ */
+retry:
+ ret = pblk_rb_may_write_user(&pblk->rwb, bio, nr_entries, &bpos);
+ if (ret == NVM_IO_REQUEUE) {
+ io_schedule();
+ goto retry;
+ }
+
+ if (unlikely(!bio_has_data(bio)))
+ goto out;
+
+ w_ctx.flags = flags;
+ pblk_ppa_set_empty(&w_ctx.ppa);
+
+ for (i = 0; i < nr_entries; i++) {
+ void *data = bio_data(bio);
+
+ w_ctx.lba = lba + i;
+
+ pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + i);
+ pblk_rb_write_entry_user(&pblk->rwb, data, w_ctx, pos);
+
+ bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(nr_entries, &pblk->inflight_writes);
+ atomic_long_add(nr_entries, &pblk->req_writes);
+#endif
+
+out:
+ pblk_write_should_kick(pblk);
+ return ret;
+}
+
+/*
+ * On GC the incoming lbas are not necessarily sequential. Also, some of the
+ * lbas might not be valid entries, which are marked as empty by the GC thread
+ */
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+ unsigned int nr_entries, unsigned int nr_rec_entries,
+ struct pblk_line *gc_line, unsigned long flags)
+{
+ struct pblk_w_ctx w_ctx;
+ unsigned int bpos, pos;
+ int i, valid_entries;
+
+ /* Update the write buffer head (mem) with the entries that we can
+ * write. The write in itself cannot fail, so there is no need to
+ * rollback from here on.
+ */
+retry:
+ if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) {
+ io_schedule();
+ goto retry;
+ }
+
+ w_ctx.flags = flags;
+ pblk_ppa_set_empty(&w_ctx.ppa);
+
+ for (i = 0, valid_entries = 0; i < nr_entries; i++) {
+ if (lba_list[i] == ADDR_EMPTY)
+ continue;
+
+ w_ctx.lba = lba_list[i];
+
+ pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries);
+ pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos);
+
+ data += PBLK_EXPOSED_PAGE_SIZE;
+ valid_entries++;
+ }
+
+ WARN_ONCE(nr_rec_entries != valid_entries,
+ "pblk: inconsistent GC write\n");
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(valid_entries, &pblk->inflight_writes);
+ atomic_long_add(valid_entries, &pblk->recov_gc_writes);
+#endif
+
+ pblk_write_should_kick(pblk);
+ return NVM_IO_OK;
+}
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
new file mode 100644
index 0000000000000..5e44768ccffa8
--- /dev/null
+++ b/drivers/lightnvm/pblk-core.c
@@ -0,0 +1,1667 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-core.c - pblk's core functionality
+ *
+ */
+
+#include "pblk.h"
+#include <linux/time.h>
+
+static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
+ struct ppa_addr *ppa)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ int pos = pblk_dev_ppa_to_pos(geo, *ppa);
+
+ pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
+ atomic_long_inc(&pblk->erase_failed);
+
+ atomic_dec(&line->blk_in_line);
+ if (test_and_set_bit(pos, line->blk_bitmap))
+ pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n",
+ line->id, pos);
+
+ pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb);
+}
+
+static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct pblk_line *line;
+
+ line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)];
+ atomic_dec(&line->left_seblks);
+
+ if (rqd->error) {
+ struct ppa_addr *ppa;
+
+ ppa = kmalloc(sizeof(struct ppa_addr), GFP_ATOMIC);
+ if (!ppa)
+ return;
+
+ *ppa = rqd->ppa_addr;
+ pblk_mark_bb(pblk, line, ppa);
+ }
+}
+
+/* Erase completion assumes that only one block is erased at a time */
+static void pblk_end_io_erase(struct nvm_rq *rqd)
+{
+ struct pblk *pblk = rqd->private;
+
+ up(&pblk->erase_sem);
+ __pblk_end_io_erase(pblk, rqd);
+ mempool_free(rqd, pblk->r_rq_pool);
+}
+
+static void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list = NULL;
+
+ /* Lines being reclaimed (GC'ed) cannot be invalidated. Before the L2P
+ * table is modified with reclaimed sectors, a check is done to ensure
+ * that newer updates are not overwritten.
+ */
+ spin_lock(&line->lock);
+ if (line->state == PBLK_LINESTATE_GC ||
+ line->state == PBLK_LINESTATE_FREE) {
+ spin_unlock(&line->lock);
+ return;
+ }
+
+ if (test_and_set_bit(paddr, line->invalid_bitmap)) {
+ WARN_ONCE(1, "pblk: double invalidate\n");
+ spin_unlock(&line->lock);
+ return;
+ }
+ line->vsc--;
+
+ if (line->state == PBLK_LINESTATE_CLOSED)
+ move_list = pblk_line_gc_list(pblk, line);
+ spin_unlock(&line->lock);
+
+ if (move_list) {
+ spin_lock(&l_mg->gc_lock);
+ spin_lock(&line->lock);
+ /* Prevent moving a line that has just been chosen for GC */
+ if (line->state == PBLK_LINESTATE_GC ||
+ line->state == PBLK_LINESTATE_FREE) {
+ spin_unlock(&line->lock);
+ spin_unlock(&l_mg->gc_lock);
+ return;
+ }
+ spin_unlock(&line->lock);
+
+ list_move_tail(&line->list, move_list);
+ spin_unlock(&l_mg->gc_lock);
+ }
+}
+
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
+{
+ struct pblk_line *line;
+ u64 paddr;
+ int line_id;
+
+#ifdef CONFIG_NVM_DEBUG
+ /* Callers must ensure that the ppa points to a device address */
+ BUG_ON(pblk_addr_in_cache(ppa));
+ BUG_ON(pblk_ppa_empty(ppa));
+#endif
+
+ line_id = pblk_tgt_ppa_to_line(ppa);
+ line = &pblk->lines[line_id];
+ paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
+
+ __pblk_map_invalidate(pblk, line, paddr);
+}
+
+void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr)
+{
+ __pblk_map_invalidate(pblk, line, paddr);
+
+ pblk_rb_sync_init(&pblk->rwb, NULL);
+ line->left_ssecs--;
+ if (!line->left_ssecs)
+ pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
+ pblk_rb_sync_end(&pblk->rwb, NULL);
+}
+
+static void pblk_invalidate_range(struct pblk *pblk, sector_t slba,
+ unsigned int nr_secs)
+{
+ sector_t lba;
+
+ spin_lock(&pblk->trans_lock);
+ for (lba = slba; lba < slba + nr_secs; lba++) {
+ struct ppa_addr ppa;
+
+ ppa = pblk_trans_map_get(pblk, lba);
+
+ if (!pblk_addr_in_cache(ppa) && !pblk_ppa_empty(ppa))
+ pblk_map_invalidate(pblk, ppa);
+
+ pblk_ppa_set_empty(&ppa);
+ pblk_trans_map_set(pblk, lba, ppa);
+ }
+ spin_unlock(&pblk->trans_lock);
+}
+
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw)
+{
+ mempool_t *pool;
+ struct nvm_rq *rqd;
+ int rq_size;
+
+ if (rw == WRITE) {
+ pool = pblk->w_rq_pool;
+ rq_size = pblk_w_rq_size;
+ } else {
+ pool = pblk->r_rq_pool;
+ rq_size = pblk_r_rq_size;
+ }
+
+ rqd = mempool_alloc(pool, GFP_KERNEL);
+ memset(rqd, 0, rq_size);
+
+ return rqd;
+}
+
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw)
+{
+ mempool_t *pool;
+
+ if (rw == WRITE)
+ pool = pblk->w_rq_pool;
+ else
+ pool = pblk->r_rq_pool;
+
+ mempool_free(rqd, pool);
+}
+
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+ int nr_pages)
+{
+ struct bio_vec bv;
+ int i;
+
+ WARN_ON(off + nr_pages != bio->bi_vcnt);
+
+ bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE);
+ for (i = off; i < nr_pages + off; i++) {
+ bv = bio->bi_io_vec[i];
+ mempool_free(bv.bv_page, pblk->page_pool);
+ }
+}
+
+/*
+ * Allocate nr_pages pages from pblk->page_pool and append them to bio, one
+ * exposed page (PBLK_EXPOSED_PAGE_SIZE bytes) at a time.
+ *
+ * Returns 0 on success. On failure every page this call managed to add is
+ * given back to the pool and -1 is returned.
+ */
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+		       int nr_pages)
+{
+	struct request_queue *q = pblk->dev->q;
+	struct page *page;
+	int i, ret;
+
+	for (i = 0; i < nr_pages; i++) {
+		page = mempool_alloc(pblk->page_pool, flags);
+		if (!page)
+			goto err;
+
+		ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0);
+		if (ret != PBLK_EXPOSED_PAGE_SIZE) {
+			pr_err("pblk: could not add page to bio\n");
+			/* This page never made it into the bio: free it here */
+			mempool_free(page, pblk->page_pool);
+			goto err;
+		}
+	}
+
+	return 0;
+err:
+	/*
+	 * At this point i pages were added by this call, as the trailing i
+	 * vectors of the bio. Release all of them: freeing only i - 1 leaked
+	 * one page, and passed a count of -1 when the very first iteration
+	 * failed. Nothing to release when i is 0.
+	 */
+	if (i)
+		pblk_bio_free_pages(pblk, bio, bio->bi_vcnt - i, i);
+	return -1;
+}
+
+/*
+ * Wake the writer thread and re-arm the write timer to fire 1000 ms from
+ * now.
+ */
+static void pblk_write_kick(struct pblk *pblk)
+{
+	unsigned long next_tick;
+
+	wake_up_process(pblk->writer_ts);
+
+	next_tick = jiffies + msecs_to_jiffies(1000);
+	mod_timer(&pblk->wtimer, next_tick);
+}
+
+/*
+ * Write timer callback; data carries the struct pblk pointer.
+ * Kicks the write thread every tick to flush outstanding data.
+ */
+void pblk_write_timer_fn(unsigned long data)
+{
+	pblk_write_kick((struct pblk *)data);
+}
+
+/*
+ * Kick the writer only when the write buffer's read count has reached
+ * pblk->min_write_pgs.
+ */
+void pblk_write_should_kick(struct pblk *pblk)
+{
+	unsigned int queued = pblk_rb_read_count(&pblk->rwb);
+
+	if (queued < pblk->min_write_pgs)
+		return;
+
+	pblk_write_kick(pblk);
+}
+
+/*
+ * bio end_io handler for synchronous internal bios: signals the completion
+ * stored in bio->bi_private.
+ */
+void pblk_end_bio_sync(struct bio *bio)
+{
+	complete((struct completion *)bio->bi_private);
+}
+
+/*
+ * Request end_io handler for synchronous internal requests: signals the
+ * completion stored in rqd->private.
+ */
+void pblk_end_io_sync(struct nvm_rq *rqd)
+{
+	complete((struct completion *)rqd->private);
+}
+
+/*
+ * Push an internal flush bio through pblk_write_to_cache() and wait for it.
+ *
+ * A one-vector bio is allocated, tagged as a flushing write and handed to
+ * the cache write path. When that path returns NVM_IO_OK the caller sleeps
+ * until the bio's end_io fires or PBLK_COMMAND_TIMEOUT_MS elapses. Failures
+ * are only logged; the function returns nothing. If the bio cannot be
+ * allocated the function returns silently without flushing.
+ */
+void pblk_flush_writer(struct pblk *pblk)
+{
+	struct bio *bio;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (!bio)
+		return;
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	/*
+	 * The third argument takes request flags, not an operation code.
+	 * OR-ing REQ_OP_FLUSH into REQ_OP_WRITE produces a different operation
+	 * value and leaves the flush flag unset; REQ_PREFLUSH is the flag that
+	 * marks a write as a flush.
+	 */
+	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_PREFLUSH);
+	bio->bi_private = &wait;
+	bio->bi_end_io = pblk_end_bio_sync;
+
+	ret = pblk_write_to_cache(pblk, bio, 0);
+	if (ret == NVM_IO_OK) {
+		if (!wait_for_completion_io_timeout(&wait,
+				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+			pr_err("pblk: flush cache timed out\n");
+		}
+	} else if (ret != NVM_IO_DONE) {
+		pr_err("pblk: tear down bio failed\n");
+	}
+
+	/* bi_error is printed as a signed value */
+	if (bio->bi_error)
+		pr_err("pblk: flush sync write failed (%d)\n", bio->bi_error);
+
+	bio_put(bio);
+}
+
+/*
+ * Pick the GC list a line belongs on, based on line->vsc.
+ *
+ * vsc is compared, in order, against 0, lm->mid_thrs, lm->high_thrs and
+ * line->sec_in_line to select the FULL, HIGH, MID, LOW or EMPTY group. If
+ * the line is already in the selected group nothing changes and NULL is
+ * returned; otherwise line->gc_group is updated and the matching list head
+ * is returned. A vsc that matches none of the ranges marks the line CORRUPT,
+ * logs an error and returns the corrupt list.
+ */
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct list_head *target;
+	int group;
+
+	if (!line->vsc) {
+		group = PBLK_LINEGC_FULL;
+		target = &l_mg->gc_full_list;
+	} else if (line->vsc < lm->mid_thrs) {
+		group = PBLK_LINEGC_HIGH;
+		target = &l_mg->gc_high_list;
+	} else if (line->vsc < lm->high_thrs) {
+		group = PBLK_LINEGC_MID;
+		target = &l_mg->gc_mid_list;
+	} else if (line->vsc < line->sec_in_line) {
+		group = PBLK_LINEGC_LOW;
+		target = &l_mg->gc_low_list;
+	} else if (line->vsc == line->sec_in_line) {
+		group = PBLK_LINEGC_EMPTY;
+		target = &l_mg->gc_empty_list;
+	} else {
+		/* vsc is out of range: always report the line as corrupt */
+		line->state = PBLK_LINESTATE_CORRUPT;
+		line->gc_group = PBLK_LINEGC_NONE;
+		pr_err("pblk: corrupted vsc for line %d, vsc:%d (%d/%d/%d)\n",
+						line->id, line->vsc,
+						line->sec_in_line,
+						lm->high_thrs, lm->mid_thrs);
+		return &l_mg->corrupt_list;
+	}
+
+	/* Already on the right list: tell the caller not to move it */
+	if (line->gc_group == group)
+		return NULL;
+
+	line->gc_group = group;
+	return target;
+}
+
+/*
+ * Handle a discard bio: invalidate the L2P entries for the logical range
+ * obtained from the bio with pblk_get_lba() and pblk_get_secs().
+ */
+void pblk_discard(struct pblk *pblk, struct bio *bio)
+{
+	sector_t first_lba = pblk_get_lba(bio);
+	sector_t count = pblk_get_secs(bio);
+
+	pblk_invalidate_range(pblk, first_lba, count);
+}
+
+/*
+ * Look up the L2P entry for lba under pblk->trans_lock and return it by
+ * value.
+ */
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba)
+{
+	struct ppa_addr entry;
+
+	spin_lock(&pblk->trans_lock);
+	entry = pblk_trans_map_get(pblk, lba);
+	spin_unlock(&pblk->trans_lock);
+
+	return entry;
+}
+
+/*
+ * Account a failed write request.
+ *
+ * Increments pblk->write_failed. When CONFIG_NVM_DEBUG is set the request
+ * is also passed to pblk_print_failed_rqd() together with rqd->error.
+ */
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ atomic_long_inc(&pblk->write_failed);
+#ifdef CONFIG_NVM_DEBUG
+ pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+/*
+ * Account a read request that completed with rqd->error set.
+ *
+ * Each known status bumps its own counter; an unknown status is logged.
+ * Except for the empty-page case, the request is also printed when
+ * CONFIG_NVM_DEBUG is set.
+ */
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	switch (rqd->error) {
+	case NVM_RSP_ERR_EMPTYPAGE:
+		/* Empty page read is not necessarily an error (e.g., L2P recovery) */
+		atomic_long_inc(&pblk->read_empty);
+		return;
+	case NVM_RSP_WARN_HIGHECC:
+		atomic_long_inc(&pblk->read_high_ecc);
+		break;
+	case NVM_RSP_ERR_FAILECC:
+	case NVM_RSP_ERR_FAILCRC:
+		atomic_long_inc(&pblk->read_failed);
+		break;
+	default:
+		pr_err("pblk: unknown read error:%d\n", rqd->error);
+		break;
+	}
+#ifdef CONFIG_NVM_DEBUG
+	pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+}
+
+/*
+ * Submit a request to the device through nvm_submit_io().
+ *
+ * With CONFIG_NVM_DEBUG the request is sanity-checked first: its addresses
+ * must pass pblk_boundary_ppa_checks(), and for NVM_OP_PWRITE every address
+ * must belong to a line in PBLK_LINESTATE_OPEN. A failed check triggers a
+ * warning and returns -EINVAL without submitting. Without CONFIG_NVM_DEBUG
+ * this is a plain wrapper.
+ *
+ * Returns the value of nvm_submit_io(), or -EINVAL from the debug checks.
+ */
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+
+#ifdef CONFIG_NVM_DEBUG
+ struct ppa_addr *ppa_list;
+
+ /* Single-address requests carry the address inline in rqd->ppa_addr */
+ ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr;
+ if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (rqd->opcode == NVM_OP_PWRITE) {
+ struct pblk_line *line;
+ struct ppa_addr ppa;
+ int i;
+
+ for (i = 0; i < rqd->nr_ppas; i++) {
+ ppa = ppa_list[i];
+ line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+
+ /* line->state is read under line->lock */
+ spin_lock(&line->lock);
+ if (line->state != PBLK_LINESTATE_OPEN) {
+ pr_err("pblk: bad ppa: line:%d,state:%d\n",
+ line->id, line->state);
+ WARN_ON(1);
+ spin_unlock(&line->lock);
+ return -EINVAL;
+ }
+ spin_unlock(&line->lock);
+ }
+ }
+#endif
+ return nvm_submit_io(dev, rqd);
+}
+
+/*
+ * Build a bio over a metadata buffer.
+ *
+ * When l_mg->emeta_alloc_type is PBLK_KMALLOC_META the buffer is mapped in
+ * one go with bio_map_kern() over len bytes. Otherwise a bio with room for
+ * nr_secs vectors is allocated and the buffer is added one PAGE_SIZE page
+ * at a time, translating each address with vmalloc_to_page().
+ *
+ * Returns the bio, or an ERR_PTR() on failure (-ENOMEM for the page-by-page
+ * path; whatever bio_map_kern() returns for the other path).
+ */
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+			      unsigned int nr_secs, unsigned int len,
+			      gfp_t gfp_mask)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	void *kaddr = data;
+	struct page *page;
+	struct bio *bio;
+	int i, ret;
+
+	if (l_mg->emeta_alloc_type == PBLK_KMALLOC_META)
+		return bio_map_kern(dev->q, kaddr, len, gfp_mask);
+
+	bio = bio_kmalloc(gfp_mask, nr_secs);
+	if (!bio)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < nr_secs; i++, kaddr += PAGE_SIZE) {
+		page = vmalloc_to_page(kaddr);
+		if (!page) {
+			pr_err("pblk: could not map vmalloc bio\n");
+			bio_put(bio);
+			return ERR_PTR(-ENOMEM);
+		}
+
+		ret = bio_add_pc_page(dev->q, bio, page, PAGE_SIZE, 0);
+		if (ret != PAGE_SIZE) {
+			pr_err("pblk: could not add page to bio\n");
+			bio_put(bio);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	return bio;
+}
+
+/*
+ * Decide how many sectors to put in the next request.
+ *
+ * - at least max_write_pgs available: max_write_pgs
+ * - at least min_write_pgs available: the largest multiple of min_write_pgs
+ *   that fits in secs_avail
+ * - less than that but secs_to_flush is non-zero: min_write_pgs
+ * - otherwise: 0
+ */
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+		   unsigned long secs_to_flush)
+{
+	const int max_pgs = pblk->max_write_pgs;
+	const int min_pgs = pblk->min_write_pgs;
+
+	if (secs_avail >= max_pgs)
+		return max_pgs;
+
+	if (secs_avail >= min_pgs)
+		return min_pgs * (secs_avail / min_pgs);
+
+	return secs_to_flush ? min_pgs : 0;
+}
+
+/*
+ * Reserve nr_secs consecutive sectors in the line's map bitmap.
+ *
+ * line->cur_sec is moved to the next clear bit at or after its current
+ * value, then nr_secs bits are set from there, warning on any bit that was
+ * already set. A request that would run past lm.sec_per_line triggers a
+ * warning and is clamped. Returns the first reserved sector.
+ */
+static u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line,
+			     int nr_secs)
+{
+	u64 first;
+	int left;
+
+	/* logic error: ppa out-of-bounds. Prevent generating bad address */
+	if (line->cur_sec + nr_secs > pblk->lm.sec_per_line) {
+		WARN(1, "pblk: page allocation out of bounds\n");
+		nr_secs = pblk->lm.sec_per_line - line->cur_sec;
+	}
+
+	first = find_next_zero_bit(line->map_bitmap,
+					pblk->lm.sec_per_line, line->cur_sec);
+	line->cur_sec = first;
+
+	for (left = nr_secs; left > 0; left--) {
+		WARN_ON(test_and_set_bit(line->cur_sec, line->map_bitmap));
+		line->cur_sec++;
+	}
+
+	return first;
+}
+
+/*
+ * Locked wrapper around __pblk_alloc_page() that also subtracts nr_secs
+ * from line->left_msecs, warning if the counter goes negative.
+ *
+ * Returns the first sector reserved on the line.
+ */
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs)
+{
+	u64 paddr;
+
+	/* Lock needed in case a write fails and a recovery needs to remap
+	 * failed write buffer entries
+	 */
+	spin_lock(&line->lock);
+
+	paddr = __pblk_alloc_page(pblk, line, nr_secs);
+	line->left_msecs -= nr_secs;
+	WARN(line->left_msecs < 0, "pblk: page allocation out of bounds\n");
+
+	spin_unlock(&line->lock);
+
+	return paddr;
+}
+
+/*
+ * Read or write the end-of-line metadata (emeta) of a line, synchronously.
+ *
+ * Submit emeta to one LUN in the raid line at the time to avoid a deadlock when
+ * taking the per LUN semaphore.
+ *
+ * The lm->emeta_sec sectors are split into requests sized by
+ * pblk_calc_secs(); each request is submitted and waited for before the
+ * next one is built. For WRITE the incoming paddr is not used: addresses
+ * come from __pblk_alloc_page(). For READ, paddr is the first sector to read
+ * and positions set in line->blk_bitmap are skipped.
+ *
+ * Returns -EINVAL for an unknown dir, -ENOMEM if the ppa list cannot be
+ * allocated, the error from bio mapping or submission, -EINTR if
+ * pblk_boundary_paddr_checks() rejects a read address, and otherwise the
+ * (zero) submission status of the last request. A device error reported in
+ * rqd.error, or a timeout, is only logged.
+ */
+static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr, int dir)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct bio *bio;
+ struct nvm_rq rqd;
+ struct ppa_addr *ppa_list;
+ dma_addr_t dma_ppa_list;
+ void *emeta = line->emeta;
+ int min = pblk->min_write_pgs;
+ int left_ppas = lm->emeta_sec;
+ int id = line->id;
+ int rq_ppas, rq_len;
+ int cmd_op, bio_op;
+ int flags;
+ int i, j;
+ int ret;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ if (dir == WRITE) {
+ bio_op = REQ_OP_WRITE;
+ cmd_op = NVM_OP_PWRITE;
+ flags = pblk_set_progr_mode(pblk, WRITE);
+ } else if (dir == READ) {
+ bio_op = REQ_OP_READ;
+ cmd_op = NVM_OP_PREAD;
+ flags = pblk_set_read_mode(pblk);
+ } else
+ return -EINVAL;
+
+ /* One ppa list is allocated up front and reused by every request */
+ ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_ppa_list);
+ if (!ppa_list)
+ return -ENOMEM;
+
+next_rq:
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+
+ /* NOTE(review): with secs_to_flush == 0, pblk_calc_secs() returns 0 once
+ * left_ppas drops below min_write_pgs, and left_ppas would then never
+ * reach zero. Presumably emeta_sec is a multiple of min_write_pgs -
+ * confirm.
+ */
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ rq_len = rq_ppas * geo->sec_size;
+
+ bio = pblk_bio_map_addr(pblk, emeta, rq_ppas, rq_len, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ goto free_rqd_dma;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, bio_op, 0);
+
+ rqd.bio = bio;
+ rqd.opcode = cmd_op;
+ rqd.flags = flags;
+ rqd.nr_ppas = rq_ppas;
+ rqd.ppa_list = ppa_list;
+ rqd.dma_ppa_list = dma_ppa_list;
+ rqd.end_io = pblk_end_io_sync;
+ rqd.private = &wait;
+
+ if (dir == WRITE) {
+ /* Reserve sectors on the line min_write_pgs at a time */
+ for (i = 0; i < rqd.nr_ppas; ) {
+ spin_lock(&line->lock);
+ paddr = __pblk_alloc_page(pblk, line, min);
+ spin_unlock(&line->lock);
+ for (j = 0; j < min; j++, i++, paddr++)
+ rqd.ppa_list[i] =
+ addr_to_gen_ppa(pblk, paddr, id);
+ }
+ } else {
+ for (i = 0; i < rqd.nr_ppas; ) {
+ struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
+ int pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+ /* Step over positions flagged in the line's block bitmap */
+ while (test_bit(pos, line->blk_bitmap)) {
+ paddr += min;
+ if (pblk_boundary_paddr_checks(pblk, paddr)) {
+ pr_err("pblk: corrupt emeta line:%d\n",
+ line->id);
+ bio_put(bio);
+ ret = -EINTR;
+ goto free_rqd_dma;
+ }
+
+ ppa = addr_to_gen_ppa(pblk, paddr, id);
+ pos = pblk_dev_ppa_to_pos(geo, ppa);
+ }
+
+ if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
+ pr_err("pblk: corrupt emeta line:%d\n",
+ line->id);
+ bio_put(bio);
+ ret = -EINTR;
+ goto free_rqd_dma;
+ }
+
+ for (j = 0; j < min; j++, i++, paddr++)
+ rqd.ppa_list[i] =
+ addr_to_gen_ppa(pblk, paddr, line->id);
+ }
+ }
+
+ ret = pblk_submit_io(pblk, &rqd);
+ if (ret) {
+ pr_err("pblk: emeta I/O submission failed: %d\n", ret);
+ bio_put(bio);
+ goto free_rqd_dma;
+ }
+
+ /* NOTE(review): rqd and wait live on this stack frame; after a timeout
+ * the function carries on while the request may still be in flight -
+ * confirm this is safe.
+ */
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: emeta I/O timed out\n");
+ }
+ /* The same completion is reused by the next request */
+ reinit_completion(&wait);
+
+ bio_put(bio);
+
+ if (rqd.error) {
+ if (dir == WRITE)
+ pblk_log_write_err(pblk, &rqd);
+ else
+ pblk_log_read_err(pblk, &rqd);
+ }
+
+ /* Advance in the emeta buffer and loop until every sector is done */
+ emeta += rq_len;
+ left_ppas -= rq_ppas;
+ if (left_ppas)
+ goto next_rq;
+free_rqd_dma:
+ nvm_dev_dma_free(dev->parent, ppa_list, dma_ppa_list);
+ return ret;
+}
+
+/*
+ * Compute the sector at which smeta starts on a line: the index of the
+ * first clear bit in line->blk_bitmap multiplied by geo->sec_per_pl.
+ *
+ * Returns -1 (converted to u64) when no bit is clear within
+ * lm->blk_per_line.
+ */
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int first_good;
+
+	first_good = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+
+	/* This usually only happens on bad lines */
+	if (first_good >= lm->blk_per_line)
+		return -1;
+
+	return first_good * geo->sec_per_pl;
+}
+
+/*
+ * Read or write the start-of-line metadata (smeta) of a line, synchronously.
+ *
+ * A single request covering lm->smeta_sec sectors starting at paddr is built
+ * over line->smeta and submitted; the caller then waits for completion or
+ * for PBLK_COMMAND_TIMEOUT_MS to elapse. On WRITE, the entries for these
+ * sectors in the lba list taken from line->emeta are set to ADDR_EMPTY.
+ *
+ * Returns -EINVAL for an unknown dir, -ENOMEM if the ppa list cannot be
+ * allocated, the error from bio_map_kern() or pblk_submit_io(), and
+ * otherwise the (zero) submission status. A device error in rqd.error, or a
+ * timeout, is only logged.
+ */
+static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr, int dir)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct bio *bio;
+ struct nvm_rq rqd;
+ __le64 *lba_list = NULL;
+ int i, ret;
+ int cmd_op, bio_op;
+ int flags;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ if (dir == WRITE) {
+ bio_op = REQ_OP_WRITE;
+ cmd_op = NVM_OP_PWRITE;
+ flags = pblk_set_progr_mode(pblk, WRITE);
+ lba_list = pblk_line_emeta_to_lbas(line->emeta);
+ } else if (dir == READ) {
+ bio_op = REQ_OP_READ;
+ cmd_op = NVM_OP_PREAD;
+ flags = pblk_set_read_mode(pblk);
+ } else
+ return -EINVAL;
+
+ memset(&rqd, 0, sizeof(struct nvm_rq));
+
+ rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+ &rqd.dma_ppa_list);
+ if (!rqd.ppa_list)
+ return -ENOMEM;
+
+ bio = bio_map_kern(dev->q, line->smeta, lm->smeta_len, GFP_KERNEL);
+ if (IS_ERR(bio)) {
+ ret = PTR_ERR(bio);
+ goto free_ppa_list;
+ }
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, bio_op, 0);
+
+ rqd.bio = bio;
+ rqd.opcode = cmd_op;
+ rqd.flags = flags;
+ rqd.nr_ppas = lm->smeta_sec;
+ rqd.end_io = pblk_end_io_sync;
+ rqd.private = &wait;
+
+ /* lba_list is only set (and only dereferenced) for WRITE */
+ for (i = 0; i < lm->smeta_sec; i++, paddr++) {
+ rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+ if (dir == WRITE)
+ lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
+ }
+
+ /*
+ * This I/O is sent by the write thread when a line is replaced. Since
+ * the write thread is the only one sending write and erase commands,
+ * there is no need to take the LUN semaphore.
+ */
+ ret = pblk_submit_io(pblk, &rqd);
+ if (ret) {
+ pr_err("pblk: smeta I/O submission failed: %d\n", ret);
+ bio_put(bio);
+ goto free_ppa_list;
+ }
+
+ /* NOTE(review): unlike the emeta path, no bio_put() follows a
+ * successful submission here - confirm the bio is released on
+ * completion.
+ */
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: smeta I/O timed out\n");
+ }
+
+ if (rqd.error) {
+ if (dir == WRITE)
+ pblk_log_write_err(pblk, &rqd);
+ else
+ pblk_log_read_err(pblk, &rqd);
+ }
+
+free_ppa_list:
+ nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+
+ return ret;
+}
+
+/*
+ * Read a line's smeta from the sector reported by pblk_line_smeta_start().
+ * Returns the result of the smeta I/O.
+ */
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
+{
+	return pblk_line_submit_smeta_io(pblk, line,
+					 pblk_line_smeta_start(pblk, line),
+					 READ);
+}
+
+/*
+ * Read a line's emeta starting at line->emeta_ssec.
+ * Returns the result of the emeta I/O.
+ */
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+	u64 emeta_start = line->emeta_ssec;
+
+	return pblk_line_submit_emeta_io(pblk, line, emeta_start, READ);
+}
+
+/*
+ * Fill a request descriptor for erasing the single address ppa: erase
+ * opcode, one ppa, erase programming flags and no bio attached.
+ */
+static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd,
+			    struct ppa_addr ppa)
+{
+	rqd->bio = NULL;
+	rqd->nr_ppas = 1;
+	rqd->ppa_addr = ppa;
+	rqd->opcode = NVM_OP_ERASE;
+	rqd->flags = pblk_set_progr_mode(pblk, ERASE);
+}
+
+/*
+ * Erase one block and wait for the result.
+ *
+ * The request lives on the stack. If submission fails the error is logged
+ * and stored in rqd.error; otherwise the caller waits for completion or for
+ * PBLK_COMMAND_TIMEOUT_MS. In every case the request is then handed to
+ * __pblk_end_io_erase(). Always returns 0.
+ */
+static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	struct nvm_rq rqd;
+	int ret;
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+	pblk_setup_e_rq(pblk, &rqd, ppa);
+	rqd.end_io = pblk_end_io_sync;
+	rqd.private = &wait;
+
+	/* The write thread schedules erases so that it minimizes disturbances
+	 * with writes. Thus, there is no need to take the LUN semaphore.
+	 */
+	ret = pblk_submit_io(pblk, &rqd);
+	if (ret) {
+		struct nvm_geo *geo = &pblk->dev->geo;
+
+		pr_err("pblk: could not sync erase line:%d,blk:%d\n",
+					pblk_dev_ppa_to_line(ppa),
+					pblk_dev_ppa_to_pos(geo, ppa));
+
+		rqd.error = ret;
+	} else if (!wait_for_completion_io_timeout(&wait,
+				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+		pr_err("pblk: sync erase timed out\n");
+	}
+
+	/* Common completion handling expects the pblk instance in ->private */
+	rqd.private = pblk;
+	__pblk_end_io_erase(pblk, &rqd);
+
+	return 0;
+}
+
+/*
+ * Erase every block of a line whose bit is still clear in
+ * line->erase_bitmap, one block at a time.
+ *
+ * Each block is claimed under line->lock (bit set, left_eblks decremented)
+ * and then erased synchronously outside the lock. Returns 0 once no clear
+ * bit is left, or -ENOMEM if pblk_blk_erase_sync() reports a failure.
+ */
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct ppa_addr ppa;
+	int pos = -1;
+
+	/* Erase only good blocks, one at a time */
+	for (;;) {
+		spin_lock(&line->lock);
+		pos = find_next_zero_bit(line->erase_bitmap, lm->blk_per_line,
+								pos + 1);
+		if (pos >= lm->blk_per_line) {
+			spin_unlock(&line->lock);
+			return 0;
+		}
+
+		ppa = pblk->luns[pos].bppa; /* set ch and lun */
+		ppa.g.blk = line->id;
+
+		atomic_dec(&line->left_eblks);
+		WARN_ON(test_and_set_bit(pos, line->erase_bitmap));
+		spin_unlock(&line->lock);
+
+		if (pblk_blk_erase_sync(pblk, ppa)) {
+			pr_err("pblk: failed to erase line %d\n", line->id);
+			return -ENOMEM;
+		}
+	}
+}
+
+/*
+ * Initialize the in-memory smeta and emeta of a line that is about to be
+ * used.
+ *
+ * For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ *
+ * @cur is the line being replaced, or NULL for the first line; when set, the
+ * two lines are linked through smeta->prev_id and cur->emeta->next_id.
+ *
+ * Returns 1 when the metadata has been set up. Returns 0 when the line has
+ * fewer usable blocks than lm->min_blk_line; in that case the line is marked
+ * PBLK_LINESTATE_BAD and added to l_mg->bad_list.
+ */
+static int pblk_line_set_metadata(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_line *cur)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct line_smeta *smeta = line->smeta;
+ struct line_emeta *emeta = line->emeta;
+ int nr_blk_line;
+
+ /* After erasing the line, new bad blocks might appear and we risk
+ * having an invalid line
+ */
+ nr_blk_line = lm->blk_per_line -
+ bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+ if (nr_blk_line < lm->min_blk_line) {
+ spin_lock(&l_mg->free_lock);
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_BAD;
+ spin_unlock(&line->lock);
+
+ list_add_tail(&line->list, &l_mg->bad_list);
+ spin_unlock(&l_mg->free_lock);
+
+ pr_debug("pblk: line %d is bad\n", line->id);
+
+ return 0;
+ }
+
+ /* Run-time metadata: the lun bitmap is stored right after the smeta
+ * structure
+ */
+ line->lun_bitmap = ((void *)(smeta)) + sizeof(struct line_smeta);
+
+ /* Mark LUNs allocated in this line (all for now) */
+ bitmap_set(line->lun_bitmap, 0, lm->lun_bitmap_len);
+
+ smeta->header.identifier = cpu_to_le32(PBLK_MAGIC);
+ memcpy(smeta->header.uuid, pblk->instance_uuid, 16);
+ smeta->header.id = cpu_to_le32(line->id);
+ smeta->header.type = cpu_to_le16(line->type);
+ smeta->header.version = cpu_to_le16(1);
+
+ /* Start metadata */
+ smeta->seq_nr = cpu_to_le64(line->seq_nr);
+ smeta->window_wr_lun = cpu_to_le32(geo->nr_luns);
+
+ /* Fill metadata among lines */
+ if (cur) {
+ memcpy(line->lun_bitmap, cur->lun_bitmap, lm->lun_bitmap_len);
+ smeta->prev_id = cpu_to_le32(cur->id);
+ cur->emeta->next_id = cpu_to_le32(line->id);
+ } else {
+ smeta->prev_id = cpu_to_le32(PBLK_LINE_EMPTY);
+ }
+
+ /* All smeta must be set at this point: the CRCs are computed over it */
+ smeta->header.crc = cpu_to_le32(pblk_calc_meta_header_crc(pblk, smeta));
+ smeta->crc = cpu_to_le32(pblk_calc_smeta_crc(pblk, smeta));
+
+ /* End metadata: starts from a copy of the smeta header */
+ memcpy(&emeta->header, &smeta->header, sizeof(struct line_header));
+ emeta->seq_nr = cpu_to_le64(line->seq_nr);
+ emeta->nr_lbas = cpu_to_le64(line->sec_in_line);
+ emeta->nr_valid_lbas = cpu_to_le64(0);
+ emeta->next_id = cpu_to_le32(PBLK_LINE_EMPTY);
+ emeta->crc = cpu_to_le32(0);
+ emeta->prev_id = smeta->prev_id;
+
+ return 1;
+}
+
+/*
+ * Initialize a line's sector bitmaps and counters from its bad block
+ * information, reserving room for smeta and emeta.
+ *
+ * For now lines are always assumed full lines. Thus, smeta former and current
+ * lun bitmaps are omitted.
+ *
+ * When @init is non-zero, smeta is also written to the device.
+ *
+ * Returns 1 on success. Returns 0 if the number of sectors accounted as
+ * unusable does not match the invalid bitmap; the line is then marked
+ * PBLK_LINESTATE_BAD and added to l_mg->bad_list.
+ */
+static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
+ int init)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ int nr_bb = 0;
+ u64 off;
+ int bit = -1;
+
+ line->sec_in_line = lm->sec_per_line;
+
+ /* Capture bad block information on line mapping bitmaps */
+ while ((bit = find_next_bit(line->blk_bitmap, lm->blk_per_line,
+ bit + 1)) < lm->blk_per_line) {
+ off = bit * geo->sec_per_pl;
+ bitmap_shift_left(l_mg->bb_aux, l_mg->bb_template, off,
+ lm->sec_per_line);
+ bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
+ lm->sec_per_line);
+ line->sec_in_line -= geo->sec_per_blk;
+ /* Bad blocks at or past lm->emeta_bb push emeta back, see below */
+ if (bit >= lm->emeta_bb)
+ nr_bb++;
+ }
+
+ /* Mark smeta metadata sectors as bad sectors */
+ bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+ off = bit * geo->sec_per_pl;
+retry_smeta:
+ bitmap_set(line->map_bitmap, off, lm->smeta_sec);
+ line->sec_in_line -= lm->smeta_sec;
+ line->smeta_ssec = off;
+ line->cur_sec = off + lm->smeta_sec;
+
+ /* NOTE(review): this retry has no upper bound on off and subtracts
+ * smeta_sec again on every pass - confirm repeated failures are handled.
+ */
+ if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) {
+ pr_debug("pblk: line smeta I/O failed. Retry\n");
+ off += geo->sec_per_pl;
+ goto retry_smeta;
+ }
+
+ bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
+
+ /* Mark emeta metadata sectors as bad sectors. We need to consider bad
+ * blocks to make sure that there are enough sectors to store emeta
+ */
+ /* The value stored in bit here is not read again in this function */
+ bit = lm->sec_per_line;
+ off = lm->sec_per_line - lm->emeta_sec;
+ bitmap_set(line->invalid_bitmap, off, lm->emeta_sec);
+ /* Walk backwards one plane at a time, claiming a free plane for each
+ * bad block counted above
+ */
+ while (nr_bb) {
+ off -= geo->sec_per_pl;
+ if (!test_bit(off, line->invalid_bitmap)) {
+ bitmap_set(line->invalid_bitmap, off, geo->sec_per_pl);
+ nr_bb--;
+ }
+ }
+
+ line->sec_in_line -= lm->emeta_sec;
+ line->emeta_ssec = off;
+ line->vsc = line->left_ssecs = line->left_msecs = line->sec_in_line;
+
+ /* Consistency check: unusable sectors must match the invalid bitmap */
+ if (lm->sec_per_line - line->sec_in_line !=
+ bitmap_weight(line->invalid_bitmap, lm->sec_per_line)) {
+ spin_lock(&line->lock);
+ line->state = PBLK_LINESTATE_BAD;
+ spin_unlock(&line->lock);
+
+ list_add_tail(&line->list, &l_mg->bad_list);
+ pr_err("pblk: unexpected line %d is bad\n", line->id);
+
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * Allocate a line's run-time bitmaps and move it from FREE to OPEN.
+ *
+ * Both bitmaps come from pblk->line_meta_pool with GFP_ATOMIC. On success
+ * the line is OPEN, its erase counters are set to blk_in_line, its erase
+ * bitmap is seeded from the bad block bitmap and its reference count is
+ * initialized.
+ *
+ * Returns 0 on success, -ENOMEM if a bitmap cannot be allocated, or -EINTR
+ * if the line is not in PBLK_LINESTATE_FREE. On any failure no bitmap is
+ * left allocated and both bitmap pointers that were touched are reset to
+ * NULL.
+ */
+static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	int blk_in_line = atomic_read(&line->blk_in_line);
+
+	line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+	if (!line->map_bitmap)
+		return -ENOMEM;
+	memset(line->map_bitmap, 0, lm->sec_bitmap_len);
+
+	/* invalid_bitmap is special since it is used when line is closed. No
+	 * need to zero it; it will be initialized using bb info from
+	 * map_bitmap
+	 */
+	line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC);
+	if (!line->invalid_bitmap) {
+		mempool_free(line->map_bitmap, pblk->line_meta_pool);
+		/* Do not leave a dangling pointer for pblk_line_free() */
+		line->map_bitmap = NULL;
+		return -ENOMEM;
+	}
+
+	spin_lock(&line->lock);
+	if (line->state != PBLK_LINESTATE_FREE) {
+		/* Release what was just allocated instead of leaking it */
+		mempool_free(line->invalid_bitmap, pblk->line_meta_pool);
+		mempool_free(line->map_bitmap, pblk->line_meta_pool);
+		line->invalid_bitmap = NULL;
+		line->map_bitmap = NULL;
+		spin_unlock(&line->lock);
+		WARN(1, "pblk: corrupted line state\n");
+		return -EINTR;
+	}
+	line->state = PBLK_LINESTATE_OPEN;
+
+	atomic_set(&line->left_eblks, blk_in_line);
+	atomic_set(&line->left_seblks, blk_in_line);
+	spin_unlock(&line->lock);
+
+	/* Bad blocks do not need to be erased */
+	bitmap_copy(line->erase_bitmap, line->blk_bitmap, lm->blk_per_line);
+
+	kref_init(&line->ref);
+
+	return 0;
+}
+
+/*
+ * Claim a specific line as the current data line (recovery path).
+ *
+ * Under l_mg->free_lock the line is made l_mg->data_line, unlinked from its
+ * list and prepared. It is then accounted with pblk_rl_free_lines_dec() and
+ * its bitmaps are initialized without writing smeta (init == 0).
+ *
+ * Returns 0 on success, the error from pblk_line_prepare(), or -EINTR if
+ * pblk_line_init_bb() rejects the line. On both failure paths the line is
+ * put back on the free list.
+ */
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ int ret;
+
+ spin_lock(&l_mg->free_lock);
+ l_mg->data_line = line;
+ list_del(&line->list);
+
+ ret = pblk_line_prepare(pblk, line);
+ if (ret) {
+ /* NOTE(review): l_mg->data_line still points at this line here */
+ list_add(&line->list, &l_mg->free_list);
+ spin_unlock(&l_mg->free_lock);
+ return ret;
+ }
+ spin_unlock(&l_mg->free_lock);
+
+ pblk_rl_free_lines_dec(&pblk->rl, line);
+
+ if (!pblk_line_init_bb(pblk, line, 0)) {
+ /* NOTE(review): pblk_line_init_bb() has already linked the line into
+ * l_mg->bad_list when it returns 0, and free_lock is not held here -
+ * confirm this second list_add is intended.
+ */
+ list_add(&line->list, &l_mg->free_list);
+ return -EINTR;
+ }
+
+ return 0;
+}
+
+/*
+ * Release the map bitmap of a line used during recovery and detach its
+ * smeta/emeta buffers. The invalid bitmap is left untouched.
+ */
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)
+{
+	void *bitmap = line->map_bitmap;
+
+	line->smeta = NULL;
+	line->emeta = NULL;
+	line->map_bitmap = NULL;
+
+	mempool_free(bitmap, pblk->line_meta_pool);
+}
+
+/*
+ * Take the first usable line off the free list and prepare it.
+ *
+ * The caller must hold l_mg->free_lock. A line with no clear bit in its
+ * blk_bitmap is marked PBLK_LINESTATE_BAD, moved to l_mg->bad_list and the
+ * next free line is tried.
+ *
+ * Returns the prepared line, or NULL when the free list is empty or when
+ * pblk_line_prepare() fails. In the latter case the line goes back on the
+ * free list and l_mg->nr_free_lines is restored.
+ */
+struct pblk_line *pblk_line_get(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line *line = NULL;
+	int bit;
+
+	lockdep_assert_held(&l_mg->free_lock);
+
+retry_get:
+	if (list_empty(&l_mg->free_list)) {
+		pr_err("pblk: no free lines\n");
+		goto out;
+	}
+
+	line = list_first_entry(&l_mg->free_list, struct pblk_line, list);
+	list_del(&line->list);
+	l_mg->nr_free_lines--;
+
+	bit = find_first_zero_bit(line->blk_bitmap, lm->blk_per_line);
+	if (unlikely(bit >= lm->blk_per_line)) {
+		spin_lock(&line->lock);
+		line->state = PBLK_LINESTATE_BAD;
+		spin_unlock(&line->lock);
+
+		list_add_tail(&line->list, &l_mg->bad_list);
+
+		pr_debug("pblk: line %d is bad\n", line->id);
+		goto retry_get;
+	}
+
+	if (pblk_line_prepare(pblk, line)) {
+		pr_err("pblk: failed to prepare line %d\n", line->id);
+		list_add(&line->list, &l_mg->free_list);
+		/* The line is free again: undo the decrement done above */
+		l_mg->nr_free_lines++;
+		return NULL;
+	}
+
+out:
+	return line;
+}
+
+/*
+ * Replace a line that could not be set up with a fresh one.
+ *
+ * A new line is taken with pblk_line_get(); it inherits the smeta/emeta
+ * buffers and meta_line index of @line, whose bitmaps are then released
+ * with pblk_line_free(). The new line becomes l_mg->data_line, is erased
+ * and accounted with pblk_rl_free_lines_dec().
+ *
+ * Returns the new line, or NULL (with l_mg->data_line cleared) if no line
+ * is available or the erase fails.
+ */
+static struct pblk_line *pblk_line_retry(struct pblk *pblk,
+ struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *retry_line;
+
+ spin_lock(&l_mg->free_lock);
+ retry_line = pblk_line_get(pblk);
+ if (!retry_line) {
+ l_mg->data_line = NULL;
+ spin_unlock(&l_mg->free_lock);
+ return NULL;
+ }
+
+ /* Hand over the metadata buffers before pblk_line_free() clears them */
+ retry_line->smeta = line->smeta;
+ retry_line->emeta = line->emeta;
+ retry_line->meta_line = line->meta_line;
+
+ pblk_line_free(pblk, line);
+ l_mg->data_line = retry_line;
+ spin_unlock(&l_mg->free_lock);
+
+ if (pblk_line_erase(pblk, retry_line)) {
+ spin_lock(&l_mg->free_lock);
+ l_mg->data_line = NULL;
+ spin_unlock(&l_mg->free_lock);
+ return NULL;
+ }
+
+ pblk_rl_free_lines_dec(&pblk->rl, retry_line);
+
+ return retry_line;
+}
+
+/*
+ * Set up the first data line, and reserve the one that will follow it.
+ *
+ * Under l_mg->free_lock a line is taken, given the next data sequence
+ * number, made l_mg->data_line and assigned the first free metadata slot
+ * (smeta/emeta buffers). A second line is taken as l_mg->data_next if one is
+ * available. The first line is then erased and its metadata and bitmaps are
+ * initialized, falling back to pblk_line_retry() whenever a step rejects
+ * the line.
+ *
+ * Returns the ready line, or NULL if no line can be obtained, the erase
+ * fails or the retry runs out of lines.
+ */
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line;
+ int meta_line;
+ int is_next = 0;
+
+ spin_lock(&l_mg->free_lock);
+ line = pblk_line_get(pblk);
+ if (!line) {
+ spin_unlock(&l_mg->free_lock);
+ return NULL;
+ }
+
+ line->seq_nr = l_mg->d_seq_nr++;
+ line->type = PBLK_LINETYPE_DATA;
+ l_mg->data_line = line;
+
+ /* Reserve a metadata slot; the result is not range-checked here */
+ meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+ set_bit(meta_line, &l_mg->meta_bitmap);
+ line->smeta = l_mg->sline_meta[meta_line].meta;
+ line->emeta = l_mg->eline_meta[meta_line].meta;
+ line->meta_line = meta_line;
+
+ /* Allocate next line for preparation */
+ l_mg->data_next = pblk_line_get(pblk);
+ if (l_mg->data_next) {
+ l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+ l_mg->data_next->type = PBLK_LINETYPE_DATA;
+ is_next = 1;
+ }
+ spin_unlock(&l_mg->free_lock);
+
+ pblk_rl_free_lines_dec(&pblk->rl, line);
+ if (is_next)
+ pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+ if (pblk_line_erase(pblk, line))
+ return NULL;
+
+ /* Both setup steps restart from here with the replacement line */
+retry_setup:
+ if (!pblk_line_set_metadata(pblk, line, NULL)) {
+ line = pblk_line_retry(pblk, line);
+ if (!line)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ if (!pblk_line_init_bb(pblk, line, 1)) {
+ line = pblk_line_retry(pblk, line);
+ if (!line)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ return line;
+}
+
+/*
+ * Switch the current data line to the one prepared in l_mg->data_next.
+ *
+ * The next line becomes l_mg->data_line. The function then loops until the
+ * line's left_seblks counter is zero, erasing the line itself while
+ * left_eblks is non-zero and yielding with io_schedule() otherwise. After
+ * that a new data_next is reserved, a metadata slot is claimed (waiting for
+ * one to become free), and the line's metadata and bitmaps are initialized,
+ * linking it to the previous line.
+ *
+ * Returns the new data line, or NULL if there was no next line, an erase
+ * failed, or pblk_line_retry() ran out of lines.
+ */
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
+{
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *cur, *new;
+ unsigned int left_seblks;
+ int meta_line;
+ int is_next = 0;
+
+ cur = l_mg->data_line;
+ new = l_mg->data_next;
+ if (!new)
+ return NULL;
+ l_mg->data_line = new;
+
+retry_line:
+ left_seblks = atomic_read(&new->left_seblks);
+ if (left_seblks) {
+ /* If line is not fully erased, erase it */
+ if (atomic_read(&new->left_eblks)) {
+ if (pblk_line_erase(pblk, new))
+ return NULL;
+ } else {
+ /* Erases were issued but have not all finished yet */
+ io_schedule();
+ }
+ goto retry_line;
+ }
+
+ spin_lock(&l_mg->free_lock);
+ /* Allocate next line for preparation */
+ l_mg->data_next = pblk_line_get(pblk);
+ if (l_mg->data_next) {
+ l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+ l_mg->data_next->type = PBLK_LINETYPE_DATA;
+ is_next = 1;
+ }
+
+ /* Wait, dropping the lock, until a metadata slot is free */
+retry_meta:
+ meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+ if (meta_line == PBLK_DATA_LINES) {
+ spin_unlock(&l_mg->free_lock);
+ io_schedule();
+ spin_lock(&l_mg->free_lock);
+ goto retry_meta;
+ }
+
+ set_bit(meta_line, &l_mg->meta_bitmap);
+ new->smeta = l_mg->sline_meta[meta_line].meta;
+ new->emeta = l_mg->eline_meta[meta_line].meta;
+ new->meta_line = meta_line;
+
+ memset(new->smeta, 0, lm->smeta_len);
+ memset(new->emeta, 0, lm->emeta_len);
+ spin_unlock(&l_mg->free_lock);
+
+ if (is_next)
+ pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+
+ /* Both setup steps restart from here with the replacement line */
+retry_setup:
+ if (!pblk_line_set_metadata(pblk, new, cur)) {
+ new = pblk_line_retry(pblk, new);
+ if (!new)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ if (!pblk_line_init_bb(pblk, new, 1)) {
+ new = pblk_line_retry(pblk, new);
+ if (!new)
+ return NULL;
+
+ goto retry_setup;
+ }
+
+ return new;
+}
+
+/*
+ * Release a line's run-time resources: both bitmaps go back to
+ * pblk->line_meta_pool (when set) and the bitmap, smeta and emeta pointers
+ * are cleared.
+ */
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line)
+{
+	mempool_t *pool = pblk->line_meta_pool;
+
+	if (line->map_bitmap) {
+		mempool_free(line->map_bitmap, pool);
+		line->map_bitmap = NULL;
+	}
+
+	if (line->invalid_bitmap) {
+		mempool_free(line->invalid_bitmap, pool);
+		line->invalid_bitmap = NULL;
+	}
+
+	line->smeta = NULL;
+	line->emeta = NULL;
+}
+
+/*
+ * kref release callback for a line: return it to the free list.
+ *
+ * Under line->lock the line is expected to be in PBLK_LINESTATE_GC (a
+ * warning is raised otherwise); it is set to FREE, taken out of any GC
+ * group and its resources are released. It is then appended to
+ * l_mg->free_list under l_mg->free_lock and reported to
+ * pblk_rl_free_lines_inc().
+ */
+void pblk_line_put(struct kref *ref)
+{
+ struct pblk_line *line = container_of(ref, struct pblk_line, ref);
+ struct pblk *pblk = line->pblk;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_GC);
+ line->state = PBLK_LINESTATE_FREE;
+ line->gc_group = PBLK_LINEGC_NONE;
+ pblk_line_free(pblk, line);
+ spin_unlock(&line->lock);
+
+ spin_lock(&l_mg->free_lock);
+ list_add_tail(&line->list, &l_mg->free_list);
+ l_mg->nr_free_lines++;
+ spin_unlock(&l_mg->free_lock);
+
+ pblk_rl_free_lines_inc(&pblk->rl, line);
+}
+
+/*
+ * Issue an erase for one block without waiting for it.
+ *
+ * The request descriptor comes from the read request pool and completes
+ * through pblk_end_io_erase(). Returns the result of pblk_submit_io(); a
+ * submission failure is logged.
+ */
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
+{
+	struct nvm_rq *rqd = pblk_alloc_rqd(pblk, READ);
+	int err;
+
+	pblk_setup_e_rq(pblk, rqd, ppa);
+	rqd->end_io = pblk_end_io_erase;
+	rqd->private = pblk;
+
+	/* The write thread schedules erases so that it minimizes disturbances
+	 * with writes. Thus, there is no need to take the LUN semaphore.
+	 */
+	err = pblk_submit_io(pblk, rqd);
+	if (!err)
+		return 0;
+
+	pr_err("pblk: could not async erase line:%d,blk:%d\n",
+					pblk_dev_ppa_to_line(ppa),
+					pblk_dev_ppa_to_pos(&pblk->dev->geo, ppa));
+
+	return err;
+}
+
+/* Return the current data line (l_mg.data_line); no locking is done here. */
+struct pblk_line *pblk_line_get_data(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+	return l_mg->data_line;
+}
+
+/* Return the line reserved to follow the current data line (l_mg.data_next). */
+struct pblk_line *pblk_line_get_data_next(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+	return l_mg->data_next;
+}
+
+/* Return 1 when the line has no sectors left to map (left_msecs is 0), else 0. */
+int pblk_line_is_full(struct pblk_line *line)
+{
+	return !line->left_msecs;
+}
+
+/*
+ * Close an open line.
+ *
+ * The emeta CRC is computed and emeta is written out (a write failure is
+ * only logged). The line's metadata slot is released in l_mg->meta_bitmap.
+ * Then, under l_mg->gc_lock and line->lock, the line moves from OPEN to
+ * CLOSED, is appended to the GC list chosen by pblk_line_gc_list(), and its
+ * map bitmap and metadata pointers are dropped.
+ */
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct list_head *move_list;
+
+ line->emeta->crc = cpu_to_le32(pblk_calc_emeta_crc(pblk, line->emeta));
+
+ if (pblk_line_submit_emeta_io(pblk, line, line->cur_sec, WRITE))
+ pr_err("pblk: line %d close I/O failed\n", line->id);
+
+ WARN(!bitmap_full(line->map_bitmap, line->sec_in_line),
+ "pblk: corrupt closed line %d\n", line->id);
+
+ /* The metadata slot can be reused by another line from now on */
+ spin_lock(&l_mg->free_lock);
+ WARN_ON(!test_and_clear_bit(line->meta_line, &l_mg->meta_bitmap));
+ spin_unlock(&l_mg->free_lock);
+
+ spin_lock(&l_mg->gc_lock);
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_OPEN);
+ line->state = PBLK_LINESTATE_CLOSED;
+ move_list = pblk_line_gc_list(pblk, line);
+
+ /* NOTE(review): pblk_line_gc_list() returns NULL when the line is
+ * already in the selected group - confirm that cannot happen here.
+ */
+ list_add_tail(&line->list, move_list);
+
+ mempool_free(line->map_bitmap, pblk->line_meta_pool);
+ line->map_bitmap = NULL;
+ line->smeta = NULL;
+ line->emeta = NULL;
+
+ spin_unlock(&line->lock);
+ spin_unlock(&l_mg->gc_lock);
+}
+
+/*
+ * Work handler that closes the line carried by a pblk_line_ws item and then
+ * returns the item to pblk->line_ws_pool.
+ */
+void pblk_line_close_ws(struct work_struct *work)
+{
+	struct pblk_line_ws *item = container_of(work, struct pblk_line_ws,
+									ws);
+	struct pblk *pblk = item->pblk;
+
+	pblk_line_close(pblk, item->line);
+	mempool_free(item, pblk->line_ws_pool);
+}
+
+/*
+ * Work handler that marks one block as grown bad in the device's bad block
+ * table.
+ *
+ * The address is carried in the work item's priv pointer. A failure is
+ * logged with the line id and block position. The address is kfree()d and
+ * the work item returned to pblk->line_ws_pool in all cases.
+ */
+void pblk_line_mark_bb(struct work_struct *work)
+{
+	struct pblk_line_ws *item = container_of(work, struct pblk_line_ws,
+									ws);
+	struct pblk *pblk = item->pblk;
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct ppa_addr *ppa = item->priv;
+
+	if (nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD)) {
+		struct pblk_line *line;
+
+		line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
+
+		pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
+				line->id, pblk_dev_ppa_to_pos(&dev->geo, *ppa));
+	}
+
+	kfree(ppa);
+	mempool_free(item, pblk->line_ws_pool);
+}
+
+/*
+ * Queue a line-related work function on pblk->kw_wq.
+ *
+ * A work item is taken from pblk->line_ws_pool with GFP_ATOMIC and filled
+ * with the pblk instance, the line and the caller's private pointer. If no
+ * item can be allocated the function returns without queueing anything.
+ */
+void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+		      void (*work)(struct work_struct *))
+{
+	struct pblk_line_ws *item = mempool_alloc(pblk->line_ws_pool,
+								GFP_ATOMIC);
+
+	if (!item)
+		return;
+
+	item->priv = priv;
+	item->line = line;
+	item->pblk = pblk;
+
+	INIT_WORK(&item->ws, work);
+	queue_work(pblk->kw_wq, &item->ws);
+}
+
+/*
+ * Take the write semaphore of the LUN a request is addressed to.
+ *
+ * The LUN is derived from the first address of ppa_list. lun_bitmap records
+ * which LUNs this request already holds: if the bit is already set the
+ * function returns at once, otherwise the bit is set and the semaphore is
+ * taken with a 5000 ms timeout. A failure to take the semaphore is only
+ * logged.
+ *
+ * NOTE(review): the bit stays set when the semaphore was not acquired, so
+ * pblk_up_rq() will still release it - confirm this is acceptable.
+ */
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+		  unsigned long *lun_bitmap)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_lun *rlun;
+	int lun_id = ppa_list[0].g.ch * geo->luns_per_chnl + ppa_list[0].g.lun;
+	int ret;
+
+	/*
+	 * Only send one inflight I/O per LUN. Since we map at a page
+	 * granularity, all ppas in the I/O will map to the same LUN
+	 */
+#ifdef CONFIG_NVM_DEBUG
+	int i;
+
+	for (i = 1; i < nr_ppas; i++)
+		WARN_ON(ppa_list[0].g.lun != ppa_list[i].g.lun ||
+				ppa_list[0].g.ch != ppa_list[i].g.ch);
+#endif
+	/* If the LUN has been locked for this same request, do not attempt to
+	 * lock it again
+	 */
+	if (test_and_set_bit(lun_id, lun_bitmap))
+		return;
+
+	rlun = &pblk->luns[lun_id];
+	ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(5000));
+	if (ret) {
+		/* Report each failure distinctly instead of always "timed out" */
+		switch (ret) {
+		case -ETIME:
+			pr_err("pblk: lun semaphore timed out\n");
+			break;
+		case -EINTR:
+			pr_err("pblk: lun semaphore interrupted\n");
+			break;
+		default:
+			pr_err("pblk: lun semaphore failed (%d)\n", ret);
+			break;
+		}
+	}
+}
+
+/*
+ * Release the write semaphore of every LUN whose bit is set in @lun_bitmap
+ * and then free the bitmap itself. @ppa_list and @nr_ppas are not used.
+ */
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+		unsigned long *lun_bitmap)
+{
+	int nr_luns = pblk->dev->geo.nr_luns;
+	int lun;
+
+	for_each_set_bit(lun, lun_bitmap, nr_luns)
+		up(&pblk->luns[lun].wr_sem);
+
+	kfree(lun_bitmap);
+}
+
+/*
+ * Point the L2P entry of @lba at @ppa. If the entry previously referenced a
+ * device address (neither a cache address nor empty), that old address is
+ * invalidated first. Both steps happen under trans_lock. Requests for an lba
+ * outside the map trigger a WARN and are dropped.
+ */
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+	struct ppa_addr old_ppa;
+
+	/* logic error: lba out-of-bounds. Ignore update */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: corrupted L2P map request\n");
+		return;
+	}
+
+	spin_lock(&pblk->trans_lock);
+
+	old_ppa = pblk_trans_map_get(pblk, lba);
+	if (!(pblk_addr_in_cache(old_ppa) || pblk_ppa_empty(old_ppa)))
+		pblk_map_invalidate(pblk, old_ppa);
+	pblk_trans_map_set(pblk, lba, ppa);
+
+	spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Update the L2P entry of @lba with @ppa by calling pblk_update_map(). When
+ * CONFIG_NVM_DEBUG is set, the kernel BUGs if @ppa is not a cache address or
+ * if pblk_rb_pos_oob() reports its cacheline position as out of bounds for
+ * the write buffer.
+ */
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa)
+{
+#ifdef CONFIG_NVM_DEBUG
+ /* Callers must ensure that the ppa points to a cache address */
+ BUG_ON(!pblk_addr_in_cache(ppa));
+ BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+ pblk_update_map(pblk, lba, ppa);
+}
+
+/*
+ * GC variant of the L2P update: redirect @lba to the cache address @ppa only
+ * if the current mapping is a device address that belongs to @gc_line.
+ *
+ * Returns 1 when the entry was updated. Returns 0 when the lba is out of
+ * bounds, or when the current mapping is a cache address, is empty, or
+ * points to a different line (the update is skipped).
+ */
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+		       struct pblk_line *gc_line)
+{
+	struct ppa_addr cur_ppa;
+	int updated = 0;
+
+#ifdef CONFIG_NVM_DEBUG
+	/* Callers must ensure that the ppa points to a cache address */
+	BUG_ON(!pblk_addr_in_cache(ppa));
+	BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa)));
+#endif
+
+	/* logic error: lba out-of-bounds. Ignore update */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: corrupted L2P map request\n");
+		return 0;
+	}
+
+	spin_lock(&pblk->trans_lock);
+	cur_ppa = pblk_trans_map_get(pblk, lba);
+
+	/* Entries updated since GC read them must not be overwritten */
+	if (!pblk_addr_in_cache(cur_ppa) && !pblk_ppa_empty(cur_ppa) &&
+	    pblk_tgt_ppa_to_line(cur_ppa) == gc_line->id) {
+		pblk_trans_map_set(pblk, lba, ppa);
+		updated = 1;
+	}
+	spin_unlock(&pblk->trans_lock);
+
+	return updated;
+}
+
+/*
+ * Replace the L2P entry of @lba with the device address @ppa, provided the
+ * entry still equals @entry_line (the value the caller expects to find).
+ *
+ * - Padded entries (@lba == ADDR_EMPTY) are only invalidated on the device.
+ * - Out-of-bounds lbas trigger a WARN and are ignored.
+ * - If the entry no longer matches @entry_line, the map is left alone and
+ *   @ppa is invalidated instead (unless @ppa is empty).
+ */
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+			 struct ppa_addr entry_line)
+{
+	struct ppa_addr cur_entry;
+
+#ifdef CONFIG_NVM_DEBUG
+	/* Callers must ensure that the ppa points to a device address */
+	BUG_ON(pblk_addr_in_cache(ppa));
+#endif
+	/* Invalidate and discard padded entries */
+	if (lba == ADDR_EMPTY) {
+#ifdef CONFIG_NVM_DEBUG
+		atomic_long_inc(&pblk->padded_wb);
+#endif
+		pblk_map_invalidate(pblk, ppa);
+		return;
+	}
+
+	/* logic error: lba out-of-bounds. Ignore update */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: corrupted L2P map request\n");
+		return;
+	}
+
+	spin_lock(&pblk->trans_lock);
+	cur_entry = pblk_trans_map_get(pblk, lba);
+
+	if (cur_entry.ppa != entry_line.ppa) {
+		/* The cacheline has been updated in the meantime: keep the
+		 * L2P entry and invalidate the mapped ppa instead
+		 */
+		if (!pblk_ppa_empty(ppa))
+			pblk_map_invalidate(pblk, ppa);
+	} else {
+#ifdef CONFIG_NVM_DEBUG
+		WARN_ON(!pblk_addr_in_cache(cur_entry) &&
+			!pblk_ppa_empty(cur_entry));
+#endif
+		pblk_trans_map_set(pblk, lba, ppa);
+	}
+	spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Copy the L2P entries of the @nr_secs consecutive lbas starting at @blba
+ * into @ppas, holding trans_lock for the whole lookup. No bounds check is
+ * performed on the lbas here.
+ */
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+			 sector_t blba, int nr_secs)
+{
+	int sec = 0;
+
+	spin_lock(&pblk->trans_lock);
+	while (sec < nr_secs) {
+		ppas[sec] = pblk_trans_map_get(pblk, blba + sec);
+		sec++;
+	}
+	spin_unlock(&pblk->trans_lock);
+}
+
+/*
+ * Look up the L2P entries for the @nr_secs lbas in @lba_list and store them
+ * in @ppas, holding trans_lock for the whole lookup.
+ *
+ * An lba equal to ADDR_EMPTY yields an empty ppa. An out-of-bounds lba
+ * triggers a WARN and also yields an empty ppa, so that every slot of @ppas
+ * is always written and callers never read an indeterminate address.
+ */
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+			  u64 *lba_list, int nr_secs)
+{
+	sector_t lba;
+	int i;
+
+	spin_lock(&pblk->trans_lock);
+	for (i = 0; i < nr_secs; i++) {
+		lba = lba_list[i];
+		if (lba == ADDR_EMPTY) {
+			ppas[i].ppa = ADDR_EMPTY;
+			continue;
+		}
+
+		/* logic error: lba out-of-bounds. Report it as unmapped */
+		if (!(lba < pblk->rl.nr_secs)) {
+			WARN(1, "pblk: corrupted L2P map request\n");
+			ppas[i].ppa = ADDR_EMPTY;
+			continue;
+		}
+
+		ppas[i] = pblk_trans_map_get(pblk, lba);
+	}
+	spin_unlock(&pblk->trans_lock);
+}
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
new file mode 100644
index 0000000000000..eaf479c6b63c8
--- /dev/null
+++ b/drivers/lightnvm/pblk-gc.c
@@ -0,0 +1,555 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-gc.c - pblk's garbage collector
+ */
+
+#include "pblk.h"
+#include <linux/delay.h>
+
+/* Release a GC request together with the lba list and data buffer it owns */
+static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq)
+{
+	kfree(gc_rq->lba_list);
+	kfree(gc_rq->data);
+	kfree(gc_rq);
+}
+
+/*
+ * Drain the GC write list: move every queued request to a private list under
+ * w_lock (decrementing w_entries per request), then, outside the lock, write
+ * each request to the cache, drop the line reference it holds and free it.
+ *
+ * Returns 1 if the write list was empty, 0 if requests were processed.
+ */
+static int pblk_gc_write(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+	struct pblk_gc_rq *rq;
+	LIST_HEAD(pending);
+
+	spin_lock(&gc->w_lock);
+	while (!list_empty(&gc->w_list)) {
+		rq = list_first_entry(&gc->w_list, struct pblk_gc_rq, list);
+		list_move_tail(&rq->list, &pending);
+		gc->w_entries--;
+	}
+	spin_unlock(&gc->w_lock);
+
+	if (list_empty(&pending))
+		return 1;
+
+	while (!list_empty(&pending)) {
+		rq = list_first_entry(&pending, struct pblk_gc_rq, list);
+
+		pblk_write_gc_to_cache(pblk, rq->data, rq->lba_list,
+				       rq->nr_secs, rq->secs_to_gc,
+				       rq->line, PBLK_IOTYPE_GC);
+
+		kref_put(&rq->line->ref, pblk_line_put);
+
+		list_del(&rq->list);
+		pblk_gc_free_gc_rq(rq);
+	}
+
+	return 0;
+}
+
+/* Wake up the GC writer kthread stored in gc->gc_writer_ts */
+static void pblk_gc_writer_kick(struct pblk_gc *gc)
+{
+ wake_up_process(gc->gc_writer_ts);
+}
+
+/*
+ * Read @nr_secs sectors listed in @lba_list from the GC victim @line and queue
+ * them for the GC writer.
+ *
+ * This function owns @lba_list: on success with sectors to move, the list and
+ * the freshly allocated data buffer are handed to the queued GC request; in
+ * every other case (allocation or read failure, or nothing left to move) both
+ * are freed here. A reference on @line is taken for each queued request.
+ *
+ * Queueing waits (sleeping between polls) while more than 256 requests are
+ * pending on the write list.
+ *
+ * Returns NVM_IO_OK on success or when nothing had to be moved, NVM_IO_ERR on
+ * allocation or read failure.
+ */
+static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_line *line,
+				   u64 *lba_list, unsigned int nr_secs)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	struct pblk_gc *gc = &pblk->gc;
+	struct pblk_gc_rq *gc_rq;
+	unsigned int secs_to_gc;
+	int ret = NVM_IO_ERR;
+	void *data;
+
+	data = kmalloc(nr_secs * geo->sec_size, GFP_KERNEL);
+	if (!data)
+		goto free_lba_list;
+
+	/* Read from GC victim block */
+	if (pblk_submit_read_gc(pblk, lba_list, data, nr_secs,
+				&secs_to_gc, line))
+		goto free_data;
+
+	if (!secs_to_gc) {
+		/* Nothing valid left to move: not an error */
+		ret = NVM_IO_OK;
+		goto free_data;
+	}
+
+	gc_rq = kmalloc(sizeof(struct pblk_gc_rq), GFP_KERNEL);
+	if (!gc_rq)
+		goto free_data;
+
+	gc_rq->line = line;
+	gc_rq->data = data;
+	gc_rq->lba_list = lba_list;
+	gc_rq->nr_secs = nr_secs;
+	gc_rq->secs_to_gc = secs_to_gc;
+
+	kref_get(&line->ref);
+
+	/* Throttle: leave the loop holding w_lock once there is room */
+	for (;;) {
+		spin_lock(&gc->w_lock);
+		if (gc->w_entries <= 256)
+			break;
+		spin_unlock(&gc->w_lock);
+		usleep_range(256, 1024);
+	}
+	gc->w_entries++;
+	list_add_tail(&gc_rq->list, &gc->w_list);
+	spin_unlock(&gc->w_lock);
+
+	pblk_gc_writer_kick(&pblk->gc);
+
+	return NVM_IO_OK;
+
+free_data:
+	kfree(data);
+free_lba_list:
+	kfree(lba_list);
+
+	return ret;
+}
+
+/*
+ * Return a line that was selected for GC to the closed state and, if
+ * pblk_line_gc_list() names a list for it, append it to that list under
+ * gc_lock. Warns if the line was not in GC state.
+ */
+static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line)
+{
+	struct list_head *target;
+
+	spin_lock(&line->lock);
+	WARN_ON(line->state != PBLK_LINESTATE_GC);
+	line->state = PBLK_LINESTATE_CLOSED;
+	target = pblk_line_gc_list(pblk, line);
+	spin_unlock(&line->lock);
+
+	if (!target)
+		return;
+
+	spin_lock(&pblk->l_mg.gc_lock);
+	list_add_tail(&line->list, target);
+	spin_unlock(&pblk->l_mg.gc_lock);
+}
+
+/*
+ * Work handler that garbage collects one line. line_ws->priv carries the
+ * little-endian lba list obtained from the line's emeta.
+ *
+ * The handler walks the sectors whose bit is clear in line->invalid_bitmap,
+ * collects their lbas in batches of at most pblk->max_write_pgs entries and
+ * passes each batch to pblk_gc_move_valid_secs(), until the valid sector
+ * count sampled at entry (line->vsc) has been covered or no sector is found.
+ *
+ * On every exit path the emeta buffer and the work item are released and
+ * gc.inflight_gc is decremented. The line reference is dropped unless the
+ * line was handed back through pblk_put_line_back() (put_line == 0).
+ */
+static void pblk_gc_line_ws(struct work_struct *work)
+{
+ struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws,
+ ws);
+ struct pblk *pblk = line_ws->pblk;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line *line = line_ws->line;
+ struct pblk_line_meta *lm = &pblk->lm;
+ __le64 *lba_list = line_ws->priv;
+ u64 *gc_list;
+ int sec_left;
+ int nr_ppas, bit;
+ int put_line = 1;
+
+ pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id);
+
+ spin_lock(&line->lock);
+ sec_left = line->vsc;
+ if (!sec_left) {
+ /* Lines are erased before being used (l_mg->data_/log_next) */
+ spin_unlock(&line->lock);
+ goto out;
+ }
+ spin_unlock(&line->lock);
+
+ if (sec_left < 0) {
+ pr_err("pblk: corrupted GC line (%d)\n", line->id);
+ put_line = 0;
+ pblk_put_line_back(pblk, line);
+ goto out;
+ }
+
+ bit = -1;
+next_rq:
+ /* One lba batch per iteration; ownership of gc_list passes to
+ * pblk_gc_move_valid_secs() below
+ */
+ gc_list = kmalloc_array(pblk->max_write_pgs, sizeof(u64), GFP_KERNEL);
+ if (!gc_list) {
+ put_line = 0;
+ pblk_put_line_back(pblk, line);
+ goto out;
+ }
+
+ nr_ppas = 0;
+ do {
+ bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line,
+ bit + 1);
+ /* Stop once the scan has moved past line->emeta_ssec */
+ if (bit > line->emeta_ssec)
+ break;
+
+ gc_list[nr_ppas++] = le64_to_cpu(lba_list[bit]);
+ } while (nr_ppas < pblk->max_write_pgs);
+
+ if (unlikely(!nr_ppas)) {
+ kfree(gc_list);
+ goto out;
+ }
+
+ if (pblk_gc_move_valid_secs(pblk, line, gc_list, nr_ppas)) {
+ pr_err("pblk: could not GC all sectors: line:%d (%d/%d/%d)\n",
+ line->id, line->vsc,
+ nr_ppas, nr_ppas);
+ put_line = 0;
+ pblk_put_line_back(pblk, line);
+ goto out;
+ }
+
+ sec_left -= nr_ppas;
+ if (sec_left > 0)
+ goto next_rq;
+
+out:
+ pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
+ mempool_free(line_ws, pblk->line_ws_pool);
+ atomic_dec(&pblk->gc.inflight_gc);
+ if (put_line)
+ kref_put(&line->ref, pblk_line_put);
+}
+
+/*
+ * Prepare @line for garbage collection and queue the actual work.
+ *
+ * Allocates a work item and an emeta buffer, reads the line's emeta and
+ * extracts the lba list from it, then queues pblk_gc_line_ws() on the GC
+ * reader workqueue with the lba list as private data.
+ *
+ * Returns 0 when the work was queued. Returns 1 on failure, after freeing
+ * what was allocated here and handing the line back with
+ * pblk_put_line_back().
+ */
+static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line_meta *lm = &pblk->lm;
+ struct pblk_line_ws *line_ws;
+ __le64 *lba_list;
+ int ret;
+
+ line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL);
+ line->emeta = pblk_malloc(lm->emeta_len, l_mg->emeta_alloc_type,
+ GFP_KERNEL);
+ if (!line->emeta) {
+ pr_err("pblk: cannot use GC emeta\n");
+ goto fail_free_ws;
+ }
+
+ ret = pblk_line_read_emeta(pblk, line);
+ if (ret) {
+ pr_err("pblk: line %d read emeta failed (%d)\n", line->id, ret);
+ goto fail_free_emeta;
+ }
+
+ /* If this read fails, it means that emeta is corrupted. For now, leave
+ * the line untouched. TODO: Implement a recovery routine that scans and
+ * moves all sectors on the line.
+ */
+ lba_list = pblk_recov_get_lba_list(pblk, line->emeta);
+ if (!lba_list) {
+ pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
+ goto fail_free_emeta;
+ }
+
+ line_ws->pblk = pblk;
+ line_ws->line = line;
+ line_ws->priv = lba_list;
+
+ /* From here on the worker frees line->emeta and line_ws */
+ INIT_WORK(&line_ws->ws, pblk_gc_line_ws);
+ queue_work(pblk->gc.gc_reader_wq, &line_ws->ws);
+
+ return 0;
+
+fail_free_emeta:
+ pblk_mfree(line->emeta, l_mg->emeta_alloc_type);
+fail_free_ws:
+ mempool_free(line_ws, pblk->line_ws_pool);
+ pblk_put_line_back(pblk, line);
+
+ return 1;
+}
+
+/*
+ * Start GC on every line of @gc_list, emptying the list in the process.
+ *
+ * Each line is unlinked from @gc_list *before* pblk_gc_line() is called:
+ * when pblk_gc_line() fails it hands the line back via pblk_put_line_back(),
+ * which links line->list into one of the GC lists. Unlinking afterwards would
+ * remove the line from the list it was just returned to.
+ *
+ * A line that could not be queued will never reach pblk_gc_line_ws(), which
+ * is where gc.inflight_gc is normally decremented, so the count taken by
+ * pblk_gc_run() is dropped here for failed lines.
+ */
+static void pblk_gc_lines(struct pblk *pblk, struct list_head *gc_list)
+{
+	struct pblk_line *line, *tline;
+
+	list_for_each_entry_safe(line, tline, gc_list, list) {
+		list_del(&line->list);
+
+		if (pblk_gc_line(pblk, line)) {
+			pr_err("pblk: failed to GC line %d\n", line->id);
+			atomic_dec(&pblk->gc.inflight_gc);
+		}
+	}
+}
+
+/*
+ * Lines with no valid sectors will be returned to the free list immediately. If
+ * GC is activated - either because the free block count is under the determined
+ * threshold, or because it is being forced from user space - only lines with a
+ * high count of invalid sectors will be recycled.
+ *
+ * Flow:
+ * 1. Every line on l_mg->gc_full_list is switched from CLOSED to GC state,
+ *    unlinked and has its reference dropped (pblk_line_put as release).
+ * 2. GC is needed while pblk_rl_gc_thrs() exceeds the free block count or
+ *    gc->gc_forced is set. The free block count is a local estimate that is
+ *    increased by blk_in_line of each line selected below.
+ * 3. Lines are taken from the current gc_lists[] group, marked as GC, counted
+ *    in gc->inflight_gc and moved to a local list that is passed to
+ *    pblk_gc_lines() once gc_lock has been released.
+ * 4. If no line was selected and rl.rb_state is above the next group index
+ *    (bounded by PBLK_NR_GC_LISTS), the next group is tried.
+ */
+static void pblk_gc_run(struct pblk *pblk)
+{
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_gc *gc = &pblk->gc;
+ struct pblk_line *line, *tline;
+ unsigned int nr_blocks_free, nr_blocks_need;
+ struct list_head *group_list;
+ int run_gc, gc_group = 0;
+ int prev_gc = 0;
+ int inflight_gc = atomic_read(&gc->inflight_gc);
+ LIST_HEAD(gc_list);
+
+ spin_lock(&l_mg->gc_lock);
+ list_for_each_entry_safe(line, tline, &l_mg->gc_full_list, list) {
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
+ line->state = PBLK_LINESTATE_GC;
+ spin_unlock(&line->lock);
+
+ list_del(&line->list);
+ kref_put(&line->ref, pblk_line_put);
+ }
+ spin_unlock(&l_mg->gc_lock);
+
+ nr_blocks_need = pblk_rl_gc_thrs(&pblk->rl);
+ nr_blocks_free = pblk_rl_nr_free_blks(&pblk->rl);
+ run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
+
+next_gc_group:
+ group_list = l_mg->gc_lists[gc_group++];
+ spin_lock(&l_mg->gc_lock);
+ while (run_gc && !list_empty(group_list)) {
+ /* No need to queue up more GC lines than we can handle */
+ if (!run_gc || inflight_gc > gc->gc_jobs_active) {
+ spin_unlock(&l_mg->gc_lock);
+ pblk_gc_lines(pblk, &gc_list);
+ return;
+ }
+
+ line = list_first_entry(group_list, struct pblk_line, list);
+ /* Account the blocks this line is expected to give back */
+ nr_blocks_free += atomic_read(&line->blk_in_line);
+
+ spin_lock(&line->lock);
+ WARN_ON(line->state != PBLK_LINESTATE_CLOSED);
+ line->state = PBLK_LINESTATE_GC;
+ list_move_tail(&line->list, &gc_list);
+ atomic_inc(&gc->inflight_gc);
+ inflight_gc++;
+ spin_unlock(&line->lock);
+
+ prev_gc = 1;
+ run_gc = (nr_blocks_need > nr_blocks_free || gc->gc_forced);
+ }
+ spin_unlock(&l_mg->gc_lock);
+
+ pblk_gc_lines(pblk, &gc_list);
+
+ if (!prev_gc && pblk->rl.rb_state > gc_group &&
+ gc_group < PBLK_NR_GC_LISTS)
+ goto next_gc_group;
+}
+
+
+/*
+ * Wake up the main GC kthread and the GC writer kthread, and re-arm the GC
+ * timer to fire again GC_TIME_MSECS milliseconds from now.
+ */
+static void pblk_gc_kick(struct pblk *pblk)
+{
+ struct pblk_gc *gc = &pblk->gc;
+
+ wake_up_process(gc->gc_ts);
+ pblk_gc_writer_kick(gc);
+ mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+}
+
+/* GC timer callback: @data is the pblk instance cast to unsigned long */
+static void pblk_gc_timer(unsigned long data)
+{
+	pblk_gc_kick((struct pblk *)data);
+}
+
+/*
+ * Main GC kthread: run one GC pass, then sleep until woken up again.
+ *
+ * The stop request is re-checked after the task state has been set to
+ * TASK_INTERRUPTIBLE. Without this, a kthread_stop() issued while a GC pass
+ * is running would find the task awake (so its wake-up is a no-op) and the
+ * thread would then go to sleep with nobody left to wake it.
+ */
+static int pblk_gc_ts(void *data)
+{
+	struct pblk *pblk = data;
+
+	while (!kthread_should_stop()) {
+		pblk_gc_run(pblk);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+		io_schedule();
+	}
+
+	return 0;
+}
+
+/*
+ * GC writer kthread: keep draining the GC write list while pblk_gc_write()
+ * reports work done (return 0); sleep once the list is empty.
+ *
+ * The stop request is re-checked after the task state has been set to
+ * TASK_INTERRUPTIBLE so that a kthread_stop() racing with the end of a
+ * drain pass cannot be missed.
+ */
+static int pblk_gc_writer_ts(void *data)
+{
+	struct pblk *pblk = data;
+
+	while (!kthread_should_stop()) {
+		if (!pblk_gc_write(pblk))
+			continue;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+		io_schedule();
+	}
+
+	return 0;
+}
+
+/*
+ * Mark GC as active. No lock is taken here; the only caller in this file,
+ * __pblk_gc_should_start(), asserts that gc->lock is held.
+ */
+static void pblk_gc_start(struct pblk *pblk)
+{
+ pblk->gc.gc_active = 1;
+
+ pr_debug("pblk: gc start\n");
+}
+
+/* Return the current gc_active flag, sampled under gc->lock */
+int pblk_gc_status(struct pblk *pblk)
+{
+	int active;
+
+	spin_lock(&pblk->gc.lock);
+	active = pblk->gc.gc_active;
+	spin_unlock(&pblk->gc.lock);
+
+	return active;
+}
+
+/*
+ * Activate GC if it is enabled and not already active. The caller must hold
+ * gc->lock (checked with lockdep).
+ */
+static void __pblk_gc_should_start(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	lockdep_assert_held(&gc->lock);
+
+	if (!gc->gc_enabled || gc->gc_active)
+		return;
+
+	pblk_gc_start(pblk);
+}
+
+/* Locked wrapper: take gc->lock and activate GC if it is enabled and idle */
+void pblk_gc_should_start(struct pblk *pblk)
+{
+	spin_lock(&pblk->gc.lock);
+	__pblk_gc_should_start(pblk);
+	spin_unlock(&pblk->gc.lock);
+}
+
+/*
+ * Clear the gc_active flag under gc->lock.
+ *
+ * @flush_wq is documented as a request to flush a workqueue (in which case no
+ * lock may be held by the caller since flush_workqueue can sleep), but this
+ * implementation does not use the argument and flushes nothing.
+ */
+static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	spin_lock(&gc->lock);
+	gc->gc_active = 0;
+	spin_unlock(&gc->lock);
+
+	pr_debug("pblk: gc stop\n");
+}
+
+/*
+ * Deactivate GC if it is currently active and not being forced. The flags
+ * are read here without taking gc->lock.
+ */
+void pblk_gc_should_stop(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	if (!gc->gc_active || gc->gc_forced)
+		return;
+
+	pblk_gc_stop(pblk, 0);
+}
+
+/*
+ * Report the GC state: store the gc_enabled and gc_active flags in the
+ * caller's variables, sampling both under gc->lock.
+ */
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+			      int *gc_active)
+{
+	spin_lock(&pblk->gc.lock);
+	*gc_enabled = pblk->gc.gc_enabled;
+	*gc_active = pblk->gc.gc_active;
+	spin_unlock(&pblk->gc.lock);
+}
+
+/*
+ * Set or clear forced GC. Forcing (non-zero @force) also enables GC and sets
+ * the rate limiter GC reserve to 64; clearing sets the reserve back to 0.
+ * GC is then started if it is enabled and idle. All under gc->lock.
+ */
+void pblk_gc_sysfs_force(struct pblk *pblk, int force)
+{
+	struct pblk_gc *gc = &pblk->gc;
+	int reserve = force ? 64 : 0;
+
+	spin_lock(&gc->lock);
+	if (force)
+		gc->gc_enabled = 1;
+
+	pblk_rl_set_gc_rsc(&pblk->rl, reserve);
+	gc->gc_forced = force;
+	__pblk_gc_should_start(pblk);
+	spin_unlock(&gc->lock);
+}
+
+/*
+ * Set up the garbage collector: state, locks, the two GC kthreads (created
+ * but not started; they first run when the timer kicks them), the reader
+ * workqueue and the periodic GC timer.
+ *
+ * Everything the timer callback and the kthreads touch is initialized before
+ * the timer is armed, and the timer is armed last so that no error path can
+ * leave it pending after the kthreads have been stopped.
+ *
+ * Returns 0 on success, a negative errno on failure.
+ */
+int pblk_gc_init(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+	int ret;
+
+	spin_lock_init(&gc->lock);
+	spin_lock_init(&gc->w_lock);
+	INIT_LIST_HEAD(&gc->w_list);
+
+	gc->gc_active = 0;
+	gc->gc_forced = 0;
+	gc->gc_enabled = 1;
+	gc->gc_jobs_active = 8;
+	gc->w_entries = 0;
+	atomic_set(&gc->inflight_gc, 0);
+
+	gc->gc_ts = kthread_create(pblk_gc_ts, pblk, "pblk-gc-ts");
+	if (IS_ERR(gc->gc_ts)) {
+		pr_err("pblk: could not allocate GC main kthread\n");
+		return PTR_ERR(gc->gc_ts);
+	}
+
+	gc->gc_writer_ts = kthread_create(pblk_gc_writer_ts, pblk,
+					  "pblk-gc-writer-ts");
+	if (IS_ERR(gc->gc_writer_ts)) {
+		pr_err("pblk: could not allocate GC writer kthread\n");
+		ret = PTR_ERR(gc->gc_writer_ts);
+		goto fail_free_main_kthread;
+	}
+
+	gc->gc_reader_wq = alloc_workqueue("pblk-gc-reader-wq",
+			WQ_MEM_RECLAIM | WQ_UNBOUND, gc->gc_jobs_active);
+	if (!gc->gc_reader_wq) {
+		pr_err("pblk: could not allocate GC reader workqueue\n");
+		ret = -ENOMEM;
+		goto fail_free_writer_kthread;
+	}
+
+	/* Nothing can fail past this point: safe to start the periodic kick */
+	setup_timer(&gc->gc_timer, pblk_gc_timer, (unsigned long)pblk);
+	mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS));
+
+	return 0;
+
+fail_free_writer_kthread:
+	kthread_stop(gc->gc_writer_ts);
+fail_free_main_kthread:
+	kthread_stop(gc->gc_ts);
+
+	return ret;
+}
+
+/*
+ * Tear down the garbage collector: flush outstanding reader work, stop the
+ * periodic timer, mark GC inactive, then stop the main kthread, destroy the
+ * reader workqueue and stop the writer kthread.
+ *
+ * The timer is removed with del_timer_sync() so that a handler running on
+ * another CPU has finished (and can no longer wake the kthreads) before they
+ * are stopped. The workqueue is only flushed when it exists, matching the
+ * check already done before destroying it.
+ */
+void pblk_gc_exit(struct pblk *pblk)
+{
+	struct pblk_gc *gc = &pblk->gc;
+
+	if (gc->gc_reader_wq)
+		flush_workqueue(gc->gc_reader_wq);
+
+	del_timer_sync(&gc->gc_timer);
+	pblk_gc_stop(pblk, 1);
+
+	if (gc->gc_ts)
+		kthread_stop(gc->gc_ts);
+
+	if (gc->gc_reader_wq)
+		destroy_workqueue(gc->gc_reader_wq);
+
+	if (gc->gc_writer_ts)
+		kthread_stop(gc->gc_writer_ts);
+}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
new file mode 100644
index 0000000000000..ae8cd6d5af8b2
--- /dev/null
+++ b/drivers/lightnvm/pblk-init.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.c)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-init.c - pblk's initialization.
+ */
+
+#include "pblk.h"
+
+static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_r_rq_cache,
+ *pblk_w_rq_cache, *pblk_line_meta_cache;
+static DECLARE_RWSEM(pblk_lock);
+
+/*
+ * Dispatch a read or write bio.
+ *
+ * Reads are always passed through blk_queue_split() and then submitted with
+ * pblk_submit_read(); if that returns NVM_IO_DONE and the bio is flagged
+ * BIO_CLONED, the bio reference is dropped here. Writes are split only when
+ * their sector count reaches the value reported by pblk_rl_sysfs_rate_show(),
+ * and are then written to the cache as user I/O.
+ *
+ * Returns the NVM_IO_* status of the read submission or of the cache write.
+ *
+ * NOTE(review): blk_queue_split() is given the address of the local @bio, so
+ * any replacement it makes is not visible to the caller - confirm that the
+ * caller completing its own bio pointer is intended.
+ */
+static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
+ struct bio *bio)
+{
+ int ret;
+
+ /* Read requests must be <= 256kb due to NVMe's 64 bit completion bitmap
+ * constraint. Writes can be of arbitrary size.
+ */
+ if (bio_data_dir(bio) == READ) {
+ blk_queue_split(q, &bio, q->bio_split);
+ ret = pblk_submit_read(pblk, bio);
+ if (ret == NVM_IO_DONE && bio_flagged(bio, BIO_CLONED))
+ bio_put(bio);
+
+ return ret;
+ }
+
+ /* Prevent deadlock in the case of a modest LUN configuration and large
+ * user I/Os. Unless stalled, the rate limiter leaves at least 256KB
+ * available for user I/O.
+ */
+ if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl)))
+ blk_queue_split(q, &bio, q->bio_split);
+
+ return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER);
+}
+
+/*
+ * make_request entry point of the pblk queue.
+ *
+ * Discards are handled by pblk_discard() and completed right away unless the
+ * bio also carries REQ_PREFLUSH, in which case it continues through the
+ * regular read/write path. For that path the bio is completed here with an
+ * error on NVM_IO_ERR and successfully on NVM_IO_DONE; any other status
+ * leaves completion to someone else. Always returns BLK_QC_T_NONE.
+ */
+static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio)
+{
+	struct pblk *pblk = q->queuedata;
+	int status;
+
+	if (bio_op(bio) == REQ_OP_DISCARD) {
+		pblk_discard(pblk, bio);
+		if (!(bio->bi_opf & REQ_PREFLUSH)) {
+			bio_endio(bio);
+			return BLK_QC_T_NONE;
+		}
+	}
+
+	status = pblk_rw_io(q, pblk, bio);
+	if (status == NVM_IO_ERR)
+		bio_io_error(bio);
+	else if (status == NVM_IO_DONE)
+		bio_endio(bio);
+
+	return BLK_QC_T_NONE;
+}
+
+/* Free the L2P translation map allocated with vmalloc() by pblk_l2p_init() */
+static void pblk_l2p_free(struct pblk *pblk)
+{
+ vfree(pblk->trans_map);
+}
+
+/*
+ * Allocate the L2P translation map with one entry per sector in rl.nr_secs
+ * and initialize every entry to the empty ppa. Entries take 4 bytes when the
+ * address format needs fewer than 32 bits, 8 bytes otherwise.
+ *
+ * Returns 0 on success, -ENOMEM if the map cannot be allocated.
+ */
+static int pblk_l2p_init(struct pblk *pblk)
+{
+	int entry_size = (pblk->ppaf_bitsize < 32) ? 4 : 8;
+	struct ppa_addr empty;
+	sector_t lba;
+
+	pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs);
+	if (!pblk->trans_map)
+		return -ENOMEM;
+
+	pblk_ppa_set_empty(&empty);
+
+	for (lba = 0; lba < pblk->rl.nr_secs; lba++)
+		pblk_trans_map_set(pblk, lba, empty);
+
+	return 0;
+}
+
+/*
+ * Release the write buffer: log an error if the tear-down check fails, free
+ * the buffer data and finally the entry array.
+ */
+static void pblk_rwb_free(struct pblk *pblk)
+{
+	struct pblk_rb *rb = &pblk->rwb;
+
+	if (pblk_rb_tear_down_check(rb))
+		pr_err("pblk: write buffer error on tear down\n");
+
+	pblk_rb_data_free(rb);
+	vfree(pblk_rb_entries_ref(rb));
+}
+
+/*
+ * Allocate the zeroed entry array of the write buffer and initialize the ring
+ * buffer with it. The buffer size and the segment (sector) size are passed to
+ * pblk_rb_init() as orders computed with get_count_order().
+ *
+ * Returns -ENOMEM if the entry array cannot be allocated, otherwise the
+ * result of pblk_rb_init().
+ *
+ * NOTE(review): the entry array is not freed here when pblk_rb_init() fails -
+ * confirm who owns it in that case.
+ */
+static int pblk_rwb_init(struct pblk *pblk)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	unsigned long nr_entries = pblk_rb_calculate_size(pblk->pgs_in_buffer);
+	unsigned int size_order, seg_order;
+	struct pblk_rb_entry *rb_entries;
+
+	rb_entries = vzalloc(nr_entries * sizeof(struct pblk_rb_entry));
+	if (!rb_entries)
+		return -ENOMEM;
+
+	size_order = get_count_order(nr_entries);
+	seg_order = get_count_order(geo->sec_size);
+
+	return pblk_rb_init(&pblk->rwb, rb_entries, size_order, seg_order);
+}
+
+/* Minimum pages needed within a lun */
+#define PAGE_POOL_SIZE 16
+#define ADDR_POOL_SIZE 64
+
+/*
+ * Build pblk's own physical address format from the device geometry.
+ *
+ * Works on a local copy of the device address format in which the channel and
+ * LUN field lengths are replaced by the orders of the channel count and of the
+ * LUNs per channel. From the field lengths it derives, for each field, the
+ * bit offset and mask stored in pblk->ppaf, laid out from bit 0 upwards as:
+ * sector, plane, channel, LUN, page, block. pblk->ppaf_bitsize is the total
+ * number of bits used.
+ *
+ * Returns 0 on success, -EINVAL if the channel count or the LUNs per channel
+ * is not a power of two.
+ */
+static int pblk_set_ppaf(struct pblk *pblk)
+{
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct nvm_addr_format ppaf = geo->ppaf;
+ int power_len;
+
+ /* Re-calculate channel and lun format to adapt to configuration */
+ power_len = get_count_order(geo->nr_chnls);
+ if (1 << power_len != geo->nr_chnls) {
+ pr_err("pblk: supports only power-of-two channel config.\n");
+ return -EINVAL;
+ }
+ ppaf.ch_len = power_len;
+
+ power_len = get_count_order(geo->luns_per_chnl);
+ if (1 << power_len != geo->luns_per_chnl) {
+ pr_err("pblk: supports only power-of-two LUN config.\n");
+ return -EINVAL;
+ }
+ ppaf.lun_len = power_len;
+
+ /* Each field starts where the previous one ends */
+ pblk->ppaf.sec_offset = 0;
+ pblk->ppaf.pln_offset = ppaf.sect_len;
+ pblk->ppaf.ch_offset = pblk->ppaf.pln_offset + ppaf.pln_len;
+ pblk->ppaf.lun_offset = pblk->ppaf.ch_offset + ppaf.ch_len;
+ pblk->ppaf.pg_offset = pblk->ppaf.lun_offset + ppaf.lun_len;
+ pblk->ppaf.blk_offset = pblk->ppaf.pg_offset + ppaf.pg_len;
+ /* Mask = field-length worth of one bits, shifted to the field offset */
+ pblk->ppaf.sec_mask = (1ULL << ppaf.sect_len) - 1;
+ pblk->ppaf.pln_mask = ((1ULL << ppaf.pln_len) - 1) <<
+ pblk->ppaf.pln_offset;
+ pblk->ppaf.ch_mask = ((1ULL << ppaf.ch_len) - 1) <<
+ pblk->ppaf.ch_offset;
+ pblk->ppaf.lun_mask = ((1ULL << ppaf.lun_len) - 1) <<
+ pblk->ppaf.lun_offset;
+ pblk->ppaf.pg_mask = ((1ULL << ppaf.pg_len) - 1) <<
+ pblk->ppaf.pg_offset;
+ pblk->ppaf.blk_mask = ((1ULL << ppaf.blk_len) - 1) <<
+ pblk->ppaf.blk_offset;
+
+ pblk->ppaf_bitsize = pblk->ppaf.blk_offset + ppaf.blk_len;
+
+ return 0;
+}
+
+/*
+ * Create the file-level slab caches (line work items, recovery contexts, read
+ * and write request contexts, and the per-line sector bitmap cache named after
+ * the disk) while holding pblk_lock for writing.
+ *
+ * Returns 0 on success. On failure the caches created so far are destroyed
+ * and -ENOMEM is returned.
+ */
+static int pblk_init_global_caches(struct pblk *pblk)
+{
+	char cache_name[PBLK_CACHE_NAME_LEN];
+
+	down_write(&pblk_lock);
+
+	pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws",
+				sizeof(struct pblk_line_ws), 0, 0, NULL);
+	if (!pblk_blk_ws_cache)
+		goto fail_unlock;
+
+	pblk_rec_cache = kmem_cache_create("pblk_rec",
+				sizeof(struct pblk_rec_ctx), 0, 0, NULL);
+	if (!pblk_rec_cache)
+		goto fail_free_blk_ws;
+
+	pblk_r_rq_cache = kmem_cache_create("pblk_r_rq", pblk_r_rq_size,
+				0, 0, NULL);
+	if (!pblk_r_rq_cache)
+		goto fail_free_rec;
+
+	pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size,
+				0, 0, NULL);
+	if (!pblk_w_rq_cache)
+		goto fail_free_r_rq;
+
+	snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s",
+		 pblk->disk->disk_name);
+	pblk_line_meta_cache = kmem_cache_create(cache_name,
+				pblk->lm.sec_bitmap_len, 0, 0, NULL);
+	if (!pblk_line_meta_cache)
+		goto fail_free_w_rq;
+
+	up_write(&pblk_lock);
+	return 0;
+
+fail_free_w_rq:
+	kmem_cache_destroy(pblk_w_rq_cache);
+fail_free_r_rq:
+	kmem_cache_destroy(pblk_r_rq_cache);
+fail_free_rec:
+	kmem_cache_destroy(pblk_rec_cache);
+fail_free_blk_ws:
+	kmem_cache_destroy(pblk_blk_ws_cache);
+fail_unlock:
+	up_write(&pblk_lock);
+	return -ENOMEM;
+}
+
+/*
+ * Initialize the core resources of a pblk instance: write size limits derived
+ * from the device geometry, the global slab caches, the mempools built on
+ * them, the auxiliary workqueue, the address format and the write buffer.
+ *
+ * Returns 0 on success. On failure everything created here is released again,
+ * including the global slab caches, and the error of the failing step is
+ * returned: -EINVAL for an unsupported geometry or address format, -ENOMEM
+ * for allocation failures, or whatever pblk_rwb_init() reported.
+ */
+static int pblk_core_init(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	int max_write_ppas;
+	u32 mod;	/* div_u64_rem() stores the remainder through a u32 * */
+	int ret = -ENOMEM;
+
+	pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
+	max_write_ppas = pblk->min_write_pgs * geo->nr_luns;
+	pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
+				max_write_ppas : nvm_max_phys_sects(dev);
+	pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
+				geo->nr_planes * geo->nr_luns;
+
+	if (pblk->max_write_pgs > PBLK_MAX_REQ_ADDRS) {
+		pr_err("pblk: cannot support device max_phys_sect\n");
+		return -EINVAL;
+	}
+
+	/* A block must hold a whole number of minimum write units */
+	div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod);
+	if (mod) {
+		pr_err("pblk: bad configuration of sectors/pages\n");
+		return -EINVAL;
+	}
+
+	if (pblk_init_global_caches(pblk))
+		return -ENOMEM;
+
+	pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
+	if (!pblk->page_pool)
+		goto free_global_caches;
+
+	pblk->line_ws_pool = mempool_create_slab_pool(geo->nr_luns,
+							pblk_blk_ws_cache);
+	if (!pblk->line_ws_pool)
+		goto free_page_pool;
+
+	pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
+	if (!pblk->rec_pool)
+		goto free_blk_ws_pool;
+
+	pblk->r_rq_pool = mempool_create_slab_pool(64, pblk_r_rq_cache);
+	if (!pblk->r_rq_pool)
+		goto free_rec_pool;
+
+	pblk->w_rq_pool = mempool_create_slab_pool(64, pblk_w_rq_cache);
+	if (!pblk->w_rq_pool)
+		goto free_r_rq_pool;
+
+	pblk->line_meta_pool =
+			mempool_create_slab_pool(16, pblk_line_meta_cache);
+	if (!pblk->line_meta_pool)
+		goto free_w_rq_pool;
+
+	pblk->kw_wq = alloc_workqueue("pblk-aux-wq",
+					WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
+	if (!pblk->kw_wq)
+		goto free_line_meta_pool;
+
+	/* From here on propagate the callee's own error code */
+	ret = pblk_set_ppaf(pblk);
+	if (ret)
+		goto free_kw_wq;
+
+	ret = pblk_rwb_init(pblk);
+	if (ret)
+		goto free_kw_wq;
+
+	INIT_LIST_HEAD(&pblk->compl_list);
+	return 0;
+
+free_kw_wq:
+	destroy_workqueue(pblk->kw_wq);
+free_line_meta_pool:
+	mempool_destroy(pblk->line_meta_pool);
+free_w_rq_pool:
+	mempool_destroy(pblk->w_rq_pool);
+free_r_rq_pool:
+	mempool_destroy(pblk->r_rq_pool);
+free_rec_pool:
+	mempool_destroy(pblk->rec_pool);
+free_blk_ws_pool:
+	mempool_destroy(pblk->line_ws_pool);
+free_page_pool:
+	mempool_destroy(pblk->page_pool);
+free_global_caches:
+	/* Mirror pblk_core_free(): do not leak the caches created above */
+	kmem_cache_destroy(pblk_blk_ws_cache);
+	kmem_cache_destroy(pblk_rec_cache);
+	kmem_cache_destroy(pblk_r_rq_cache);
+	kmem_cache_destroy(pblk_w_rq_cache);
+	kmem_cache_destroy(pblk_line_meta_cache);
+	return ret;
+}
+
+/*
+ * Release the core resources set up by pblk_core_init(): the auxiliary
+ * workqueue (if it exists), all mempools, and then the file-level slab caches
+ * the mempools were created from.
+ */
+static void pblk_core_free(struct pblk *pblk)
+{
+ if (pblk->kw_wq)
+ destroy_workqueue(pblk->kw_wq);
+
+ mempool_destroy(pblk->page_pool);
+ mempool_destroy(pblk->line_ws_pool);
+ mempool_destroy(pblk->rec_pool);
+ mempool_destroy(pblk->r_rq_pool);
+ mempool_destroy(pblk->w_rq_pool);
+ mempool_destroy(pblk->line_meta_pool);
+
+ /* Caches are destroyed only after the pools that draw from them */
+ kmem_cache_destroy(pblk_blk_ws_cache);
+ kmem_cache_destroy(pblk_rec_cache);
+ kmem_cache_destroy(pblk_r_rq_cache);
+ kmem_cache_destroy(pblk_w_rq_cache);
+ kmem_cache_destroy(pblk_line_meta_cache);
+}
+
+/*
+ * Free the pblk->luns array. The per-LUN bb_list buffers are not freed here.
+ */
+static void pblk_luns_free(struct pblk *pblk)
+{
+ kfree(pblk->luns);
+}
+
+/*
+ * For every line of the instance, call pblk_line_free() and release the
+ * line's block and erase bitmaps. Runs with l_mg->free_lock held.
+ */
+static void pblk_lines_free(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	int idx;
+
+	spin_lock(&l_mg->free_lock);
+	for (idx = 0; idx < l_mg->nr_lines; idx++) {
+		struct pblk_line *cur = &pblk->lines[idx];
+
+		pblk_line_free(pblk, cur);
+		kfree(cur->blk_bitmap);
+		kfree(cur->erase_bitmap);
+	}
+	spin_unlock(&l_mg->free_lock);
+}
+
+/*
+ * Free the line management metadata: the bad block template and auxiliary
+ * bitmaps, the smeta/emeta buffers of all PBLK_DATA_LINES slots (each with
+ * its own allocation type), and the line array itself.
+ */
+static void pblk_line_meta_free(struct pblk *pblk)
+{
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	int slot = 0;
+
+	kfree(l_mg->bb_template);
+	kfree(l_mg->bb_aux);
+
+	while (slot < PBLK_DATA_LINES) {
+		pblk_mfree(l_mg->sline_meta[slot].meta,
+			   l_mg->smeta_alloc_type);
+		pblk_mfree(l_mg->eline_meta[slot].meta,
+			   l_mg->emeta_alloc_type);
+		slot++;
+	}
+
+	kfree(pblk->lines);
+}
+
+/*
+ * Read the bad block table of the LUN described by @rlun and store the folded
+ * table in rlun->bb_list.
+ *
+ * Returns 0 on success (the buffer is then owned by @rlun), -ENOMEM if the
+ * buffer cannot be allocated, or the error reported by the table read or by
+ * the fold step; in those cases the buffer is freed here.
+ */
+static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
+{
+	struct nvm_geo *geo = &dev->geo;
+	int nr_blks = geo->blks_per_lun * geo->plane_mode;
+	struct ppa_addr lun_ppa;
+	u8 *bb_tbl;
+	int ret;
+
+	bb_tbl = kmalloc(nr_blks, GFP_KERNEL);
+	if (!bb_tbl)
+		return -ENOMEM;
+
+	lun_ppa.ppa = 0;
+	lun_ppa.g.ch = rlun->bppa.g.ch;
+	lun_ppa.g.lun = rlun->bppa.g.lun;
+
+	ret = nvm_get_tgt_bb_tbl(dev, lun_ppa, bb_tbl);
+	if (!ret) {
+		nr_blks = nvm_bb_tbl_fold(dev->parent, bb_tbl, nr_blks);
+		if (nr_blks >= 0) {
+			rlun->bb_list = bb_tbl;
+			return 0;
+		}
+		ret = nr_blks;
+	}
+
+	kfree(bb_tbl);
+	return ret;
+}
+
+/*
+ * Allocate the block and erase bitmaps of @line and mark in the block bitmap
+ * every LUN whose bad block list entry for this line is not NVM_BLK_T_FREE.
+ *
+ * Returns the number of blocks marked, or -ENOMEM if a bitmap cannot be
+ * allocated.
+ */
+static int pblk_bb_line(struct pblk *pblk, struct pblk_line *line)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	int nr_bad = 0;
+	int lun;
+
+	line->blk_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+	if (!line->blk_bitmap)
+		return -ENOMEM;
+
+	line->erase_bitmap = kzalloc(lm->blk_bitmap_len, GFP_KERNEL);
+	if (!line->erase_bitmap) {
+		kfree(line->blk_bitmap);
+		return -ENOMEM;
+	}
+
+	for (lun = 0; lun < lm->blk_per_line; lun++) {
+		if (pblk->luns[lun].bb_list[line->id] != NVM_BLK_T_FREE) {
+			set_bit(lun, line->blk_bitmap);
+			nr_bad++;
+		}
+	}
+
+	return nr_bad;
+}
+
+/*
+ * Allocate and initialize the per-LUN state of the instance.
+ *
+ * Entry i of pblk->luns is assigned the LUN address obtained by striping
+ * across channels (channel = i % nr_chnls), gets a write semaphore with an
+ * initial count of 1, and has its bad block list read from the device.
+ *
+ * Returns 0 on success, -EINVAL for a negative luns_per_chnl, -ENOMEM if the
+ * LUN array cannot be allocated, or the bad block discovery error. On a
+ * discovery failure the bad block lists read so far and the LUN array itself
+ * are freed, and pblk->luns is reset to NULL so that a later kfree() of it is
+ * harmless.
+ */
+static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_lun *rlun;
+	int i, ret;
+
+	/* TODO: Implement unbalanced LUN support */
+	if (geo->luns_per_chnl < 0) {
+		pr_err("pblk: unbalanced LUN config.\n");
+		return -EINVAL;
+	}
+
+	pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL);
+	if (!pblk->luns)
+		return -ENOMEM;
+
+	for (i = 0; i < geo->nr_luns; i++) {
+		/* Stripe across channels */
+		int ch = i % geo->nr_chnls;
+		int lun_raw = i / geo->nr_chnls;
+		int lunid = lun_raw + ch * geo->luns_per_chnl;
+
+		rlun = &pblk->luns[i];
+		rlun->bppa = luns[lunid];
+
+		sema_init(&rlun->wr_sem, 1);
+
+		ret = pblk_bb_discovery(dev, rlun);
+		if (ret)
+			goto fail_free_luns;
+	}
+
+	return 0;
+
+fail_free_luns:
+	/* Entry i failed and owns no list; unwind the ones before it */
+	while (--i >= 0)
+		kfree(pblk->luns[i].bb_list);
+	kfree(pblk->luns);
+	pblk->luns = NULL;
+
+	return ret;
+}
+
+/*
+ * Pick the line pblk starts writing to.
+ *
+ * Unless @flags contains NVM_TARGET_FACTORY, the L2P table is recovered first
+ * and recovery may hand back a line. If recovery was skipped or returned no
+ * line, the first data line is configured instead.
+ *
+ * Returns 0 on success, -EFAULT if recovery reported an error or no first
+ * data line could be obtained.
+ */
+static int pblk_lines_configure(struct pblk *pblk, int flags)
+{
+	struct pblk_line *line = NULL;
+	int ret = 0;
+
+	if (!(flags & NVM_TARGET_FACTORY)) {
+		line = pblk_recov_l2p(pblk);
+		if (IS_ERR(line)) {
+			pr_err("pblk: could not recover l2p table\n");
+			ret = -EFAULT;
+		}
+	}
+
+	if (line)
+		return ret;
+
+	/* Configure next line for user data */
+	if (!pblk_line_get_first_data(pblk)) {
+		pr_err("pblk: line list corrupted\n");
+		ret = -EFAULT;
+	}
+
+	return ret;
+}
+
+/*
+ * Size in bytes needed for a line's emeta: the emeta header, one u64 per
+ * sector of the line that is not an emeta sector, one u32 per line, and the
+ * block bitmap. See comment over struct line_emeta definition.
+ */
+static unsigned int calc_emeta_len(struct pblk *pblk, struct pblk_line_meta *lm)
+{
+	size_t len = sizeof(struct line_emeta);
+
+	len += (lm->sec_per_line - lm->emeta_sec) * sizeof(u64);
+	len += pblk->l_mg.nr_lines * sizeof(u32);
+	len += lm->blk_bitmap_len;
+
+	return len;
+}
+
+/*
+ * Derive capacity figures from the number of free blocks. Over-provisioning
+ * is fixed at 20%: the exported capacity covers the remaining 80% of the
+ * blocks, while the rate limiter is told about all of them.
+ */
+static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	sector_t provisioned = nr_free_blks;
+
+	pblk->over_pct = 20;
+
+	provisioned *= (100 - pblk->over_pct);
+	sector_div(provisioned, 100);
+
+	/* Internally pblk manages all free blocks, but all calculations based
+	 * on user capacity consider only provisioned blocks
+	 */
+	pblk->capacity = provisioned * geo->sec_per_blk;
+	pblk->rl.total_blocks = nr_free_blks;
+	pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk;
+	atomic_set(&pblk->rl.free_blocks, nr_free_blks);
+}
+
+/*
+ * Compute the line geometry and metadata sizes, allocate the per-line
+ * metadata buffers and bad block bitmaps, and classify every line as free or
+ * bad according to its number of bad blocks.
+ *
+ * Returns 0 on success or a negative errno. On failure everything allocated
+ * by this function is released again. The per-LUN bad block lists are freed
+ * on both the success and the failure path.
+ */
+static int pblk_lines_init(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line *line;
+	unsigned int smeta_len, emeta_len;
+	long nr_bad_blks, nr_meta_blks, nr_free_blks;
+	int bb_distance;
+	int i;
+	int ret;
+
+	/* calc_emeta_len() reads l_mg->nr_lines, so the line management state
+	 * must be set up before the emeta size is computed below
+	 */
+	l_mg->nr_lines = geo->blks_per_lun;
+	l_mg->log_line = l_mg->data_line = NULL;
+	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
+	l_mg->nr_free_lines = 0;
+	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+
+	lm->sec_per_line = geo->sec_per_blk * geo->nr_luns;
+	lm->blk_per_line = geo->nr_luns;
+	lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+	lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
+	lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+	lm->high_thrs = lm->sec_per_line / 2;
+	lm->mid_thrs = lm->sec_per_line / 4;
+
+	/* Calculate necessary pages for smeta. See comment over struct
+	 * line_smeta definition. smeta grows in units of one plane page
+	 * (geo->sec_per_pl sectors) until the structure fits.
+	 */
+	i = 1;
+add_smeta_page:
+	lm->smeta_sec = i * geo->sec_per_pl;
+	lm->smeta_len = lm->smeta_sec * geo->sec_size;
+
+	smeta_len = sizeof(struct line_smeta) +
+				PBLK_LINE_NR_LUN_BITMAP * lm->lun_bitmap_len;
+	if (smeta_len > lm->smeta_len) {
+		i++;
+		goto add_smeta_page;
+	}
+
+	/* Calculate necessary pages for emeta. See comment over struct
+	 * line_emeta definition
+	 */
+	i = 1;
+add_emeta_page:
+	lm->emeta_sec = i * geo->sec_per_pl;
+	lm->emeta_len = lm->emeta_sec * geo->sec_size;
+
+	emeta_len = calc_emeta_len(pblk, lm);
+	if (emeta_len > lm->emeta_len) {
+		i++;
+		goto add_emeta_page;
+	}
+	lm->emeta_bb = geo->nr_luns - i;
+
+	nr_meta_blks = (lm->smeta_sec + lm->emeta_sec +
+				(geo->sec_per_blk / 2)) / geo->sec_per_blk;
+	lm->min_blk_line = nr_meta_blks + 1;
+
+	/* smeta is always small enough to fit on a kmalloc memory allocation,
+	 * emeta depends on the number of LUNs allocated to the pblk instance
+	 */
+	l_mg->smeta_alloc_type = PBLK_KMALLOC_META;
+	for (i = 0; i < PBLK_DATA_LINES; i++) {
+		l_mg->sline_meta[i].meta = kmalloc(lm->smeta_len, GFP_KERNEL);
+		if (!l_mg->sline_meta[i].meta) {
+			/* Undo the buffers allocated so far */
+			while (--i >= 0)
+				kfree(l_mg->sline_meta[i].meta);
+			ret = -ENOMEM;
+			goto fail;
+		}
+	}
+
+	if (lm->emeta_len > KMALLOC_MAX_CACHE_SIZE)
+		l_mg->emeta_alloc_type = PBLK_VMALLOC_META;
+	else
+		l_mg->emeta_alloc_type = PBLK_KMALLOC_META;
+
+	for (i = 0; i < PBLK_DATA_LINES; i++) {
+		if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META)
+			l_mg->eline_meta[i].meta = vmalloc(lm->emeta_len);
+		else
+			l_mg->eline_meta[i].meta =
+					kmalloc(lm->emeta_len, GFP_KERNEL);
+
+		if (!l_mg->eline_meta[i].meta) {
+			/* Undo the emeta buffers allocated so far; the smeta
+			 * buffers are all allocated and freed at the label
+			 */
+			while (--i >= 0)
+				pblk_mfree(l_mg->eline_meta[i].meta,
+						l_mg->emeta_alloc_type);
+			ret = -ENOMEM;
+			goto fail_free_smeta;
+		}
+	}
+
+	l_mg->bb_template = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!l_mg->bb_template) {
+		ret = -ENOMEM;
+		goto fail_free_meta;
+	}
+
+	l_mg->bb_aux = kzalloc(lm->sec_bitmap_len, GFP_KERNEL);
+	if (!l_mg->bb_aux) {
+		ret = -ENOMEM;
+		goto fail_free_bb_template;
+	}
+
+	/* Mark the first plane page of every bb_distance-sized stripe */
+	bb_distance = (geo->nr_luns) * geo->sec_per_pl;
+	for (i = 0; i < lm->sec_per_line; i += bb_distance)
+		bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
+
+	INIT_LIST_HEAD(&l_mg->free_list);
+	INIT_LIST_HEAD(&l_mg->corrupt_list);
+	INIT_LIST_HEAD(&l_mg->bad_list);
+	INIT_LIST_HEAD(&l_mg->gc_full_list);
+	INIT_LIST_HEAD(&l_mg->gc_high_list);
+	INIT_LIST_HEAD(&l_mg->gc_mid_list);
+	INIT_LIST_HEAD(&l_mg->gc_low_list);
+	INIT_LIST_HEAD(&l_mg->gc_empty_list);
+
+	l_mg->gc_lists[0] = &l_mg->gc_high_list;
+	l_mg->gc_lists[1] = &l_mg->gc_mid_list;
+	l_mg->gc_lists[2] = &l_mg->gc_low_list;
+
+	spin_lock_init(&l_mg->free_lock);
+	spin_lock_init(&l_mg->gc_lock);
+
+	pblk->lines = kcalloc(l_mg->nr_lines, sizeof(struct pblk_line),
+								GFP_KERNEL);
+	if (!pblk->lines) {
+		ret = -ENOMEM;
+		goto fail_free_bb_aux;
+	}
+
+	nr_free_blks = 0;
+	for (i = 0; i < l_mg->nr_lines; i++) {
+		int blk_in_line;
+
+		line = &pblk->lines[i];
+
+		line->pblk = pblk;
+		line->id = i;
+		line->type = PBLK_LINETYPE_FREE;
+		line->state = PBLK_LINESTATE_FREE;
+		line->gc_group = PBLK_LINEGC_NONE;
+		spin_lock_init(&line->lock);
+
+		nr_bad_blks = pblk_bb_line(pblk, line);
+		if (nr_bad_blks < 0 || nr_bad_blks > lm->blk_per_line) {
+			ret = -EINVAL;
+			goto fail_free_lines;
+		}
+
+		/* A line with too few good blocks cannot hold its own
+		 * metadata plus data and is retired to the bad list
+		 */
+		blk_in_line = lm->blk_per_line - nr_bad_blks;
+		if (blk_in_line < lm->min_blk_line) {
+			line->state = PBLK_LINESTATE_BAD;
+			list_add_tail(&line->list, &l_mg->bad_list);
+			continue;
+		}
+
+		nr_free_blks += blk_in_line;
+		atomic_set(&line->blk_in_line, blk_in_line);
+
+		l_mg->nr_free_lines++;
+		list_add_tail(&line->list, &l_mg->free_list);
+	}
+
+	pblk_set_provision(pblk, nr_free_blks);
+
+	sema_init(&pblk->erase_sem, 1);
+
+	/* Cleanup per-LUN bad block lists - managed within lines on run-time */
+	for (i = 0; i < geo->nr_luns; i++)
+		kfree(pblk->luns[i].bb_list);
+
+	return 0;
+fail_free_lines:
+	kfree(pblk->lines);
+fail_free_bb_aux:
+	kfree(l_mg->bb_aux);
+fail_free_bb_template:
+	kfree(l_mg->bb_template);
+fail_free_meta:
+	for (i = 0; i < PBLK_DATA_LINES; i++)
+		pblk_mfree(l_mg->eline_meta[i].meta, l_mg->emeta_alloc_type);
+fail_free_smeta:
+	for (i = 0; i < PBLK_DATA_LINES; i++)
+		pblk_mfree(l_mg->sline_meta[i].meta, l_mg->smeta_alloc_type);
+fail:
+	for (i = 0; i < geo->nr_luns; i++)
+		kfree(pblk->luns[i].bb_list);
+
+	return ret;
+}
+
+/*
+ * Create the writer kthread and arm the periodic write timer.
+ *
+ * The kthread is created before the timer is armed so that a failure leaves
+ * no pending timer behind that would reference the instance after the caller
+ * frees it. The thread is only created here; it is woken up by the caller.
+ *
+ * Returns 0 on success or the kthread_create() error code.
+ */
+static int pblk_writer_init(struct pblk *pblk)
+{
+	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
+	if (IS_ERR(pblk->writer_ts)) {
+		int err = PTR_ERR(pblk->writer_ts);
+
+		pr_err("pblk: could not allocate writer kthread\n");
+		/* Do not leave an error pointer behind for pblk_writer_stop() */
+		pblk->writer_ts = NULL;
+		return err;
+	}
+
+	setup_timer(&pblk->wtimer, pblk_write_timer_fn, (unsigned long)pblk);
+	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
+
+	return 0;
+}
+
+/*
+ * Stop the write path helpers: the writer kthread is stopped first (if one
+ * is recorded in pblk->writer_ts), then the write timer is deleted.
+ */
+static void pblk_writer_stop(struct pblk *pblk)
+{
+ if (pblk->writer_ts)
+ kthread_stop(pblk->writer_ts);
+ del_timer(&pblk->wtimer);
+}
+
+/*
+ * Release the resources owned by the instance (LUNs, lines, line metadata,
+ * core structures and the L2P table) and finally the instance itself. @pblk
+ * must not be used after this call.
+ */
+static void pblk_free(struct pblk *pblk)
+{
+ pblk_luns_free(pblk);
+ pblk_lines_free(pblk);
+ pblk_line_meta_free(pblk);
+ pblk_core_free(pblk);
+ pblk_l2p_free(pblk);
+
+ kfree(pblk);
+}
+
+/*
+ * Quiesce the write path before the instance is freed: flush and stop the
+ * writer, move the mappings still pointing at the write buffer to the L2P
+ * table (pblk_rb_sync_l2p), run pblk_recov_pad(), then free the write buffer
+ * and the rate limiter.
+ */
+static void pblk_tear_down(struct pblk *pblk)
+{
+ pblk_flush_writer(pblk);
+ pblk_writer_stop(pblk);
+ pblk_rb_sync_l2p(&pblk->rwb);
+ pblk_recov_pad(pblk);
+ pblk_rwb_free(pblk);
+ pblk_rl_free(&pblk->rl);
+
+ pr_debug("pblk: consistent tear down\n");
+}
+
+/*
+ * Target exit callback (tt_pblk.exit). @private is the struct pblk returned
+ * by pblk_init(). GC exit, tear down and free all run with pblk_lock held
+ * for writing.
+ */
+static void pblk_exit(void *private)
+{
+ struct pblk *pblk = private;
+
+ down_write(&pblk_lock);
+ pblk_gc_exit(pblk);
+ pblk_tear_down(pblk);
+ pblk_free(pblk);
+ up_write(&pblk_lock);
+}
+
+/*
+ * Target capacity callback (tt_pblk.capacity): the provisioned capacity
+ * scaled by NR_PHY_IN_LOG.
+ */
+static sector_t pblk_capacity(void *private)
+{
+	const struct pblk *instance = private;
+	sector_t nr_secs = instance->capacity;
+
+	return nr_secs * NR_PHY_IN_LOG;
+}
+
+/*
+ * Target init callback (tt_pblk.init): allocate and bring up a pblk instance
+ * on top of @dev and configure the queue limits of @tdisk.
+ *
+ * Initialization order is luns -> lines -> core -> l2p -> line configuration
+ * -> writer -> gc; the error labels at the end undo the completed steps in
+ * reverse order. The writer thread is only woken once everything is set up.
+ *
+ * Returns the new instance, or an ERR_PTR() on failure.
+ */
+static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+		       int flags)
+{
+	struct nvm_geo *geo = &dev->geo;
+	struct request_queue *bqueue = dev->q;
+	struct request_queue *tqueue = tdisk->queue;
+	struct pblk *pblk;
+	int ret;
+
+	if (dev->identity.dom & NVM_RSP_L2P) {
+		pr_err("pblk: device-side L2P table not supported. (%x)\n",
+							dev->identity.dom);
+		return ERR_PTR(-EINVAL);
+	}
+
+	pblk = kzalloc(sizeof(struct pblk), GFP_KERNEL);
+	if (!pblk)
+		return ERR_PTR(-ENOMEM);
+
+	pblk->dev = dev;
+	pblk->disk = tdisk;
+
+	spin_lock_init(&pblk->trans_lock);
+	spin_lock_init(&pblk->lock);
+
+	if (flags & NVM_TARGET_FACTORY)
+		pblk_setup_uuid(pblk);
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_set(&pblk->inflight_writes, 0);
+	atomic_long_set(&pblk->padded_writes, 0);
+	atomic_long_set(&pblk->padded_wb, 0);
+	atomic_long_set(&pblk->nr_flush, 0);
+	atomic_long_set(&pblk->req_writes, 0);
+	atomic_long_set(&pblk->sub_writes, 0);
+	atomic_long_set(&pblk->sync_writes, 0);
+	atomic_long_set(&pblk->compl_writes, 0);
+	atomic_long_set(&pblk->inflight_reads, 0);
+	atomic_long_set(&pblk->sync_reads, 0);
+	atomic_long_set(&pblk->recov_writes, 0);
+	atomic_long_set(&pblk->recov_gc_writes, 0);
+#endif
+
+	atomic_long_set(&pblk->read_failed, 0);
+	atomic_long_set(&pblk->read_empty, 0);
+	atomic_long_set(&pblk->read_high_ecc, 0);
+	atomic_long_set(&pblk->read_failed_gc, 0);
+	atomic_long_set(&pblk->write_failed, 0);
+	atomic_long_set(&pblk->erase_failed, 0);
+
+	ret = pblk_luns_init(pblk, dev->luns);
+	if (ret) {
+		pr_err("pblk: could not initialize luns\n");
+		goto fail;
+	}
+
+	ret = pblk_lines_init(pblk);
+	if (ret) {
+		pr_err("pblk: could not initialize lines\n");
+		goto fail_free_luns;
+	}
+
+	ret = pblk_core_init(pblk);
+	if (ret) {
+		pr_err("pblk: could not initialize core\n");
+		goto fail_free_line_meta;
+	}
+
+	ret = pblk_l2p_init(pblk);
+	if (ret) {
+		pr_err("pblk: could not initialize maps\n");
+		goto fail_free_core;
+	}
+
+	ret = pblk_lines_configure(pblk, flags);
+	if (ret) {
+		pr_err("pblk: could not configure lines\n");
+		goto fail_free_l2p;
+	}
+
+	ret = pblk_writer_init(pblk);
+	if (ret) {
+		pr_err("pblk: could not initialize write thread\n");
+		goto fail_free_lines;
+	}
+
+	ret = pblk_gc_init(pblk);
+	if (ret) {
+		pr_err("pblk: could not initialize gc\n");
+		goto fail_stop_writer;
+	}
+
+	/* inherit the size from the underlying device */
+	blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
+	blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
+
+	blk_queue_write_cache(tqueue, true, false);
+
+	tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size;
+	tqueue->limits.discard_alignment = 0;
+	blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
+
+	pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
+			geo->nr_luns, pblk->l_mg.nr_lines,
+			(unsigned long long)pblk->rl.nr_secs,
+			pblk->rwb.nr_entries);
+
+	/* The writer thread was created stopped by pblk_writer_init() */
+	wake_up_process(pblk->writer_ts);
+	return pblk;
+
+fail_stop_writer:
+	pblk_writer_stop(pblk);
+fail_free_lines:
+	pblk_lines_free(pblk);
+fail_free_l2p:
+	pblk_l2p_free(pblk);
+fail_free_core:
+	pblk_core_free(pblk);
+fail_free_line_meta:
+	pblk_line_meta_free(pblk);
+fail_free_luns:
+	pblk_luns_free(pblk);
+fail:
+	kfree(pblk);
+	return ERR_PTR(ret);
+}
+
+/*
+ * physical block device target: the target type descriptor that
+ * pblk_module_init() registers and pblk_module_exit() unregisters. It wires
+ * the request, capacity, init/exit and sysfs callbacks of this driver.
+ */
+static struct nvm_tgt_type tt_pblk = {
+ .name = "pblk",
+ .version = {1, 0, 0},
+
+ .make_rq = pblk_make_rq,
+ .capacity = pblk_capacity,
+
+ .init = pblk_init,
+ .exit = pblk_exit,
+
+ .sysfs_init = pblk_sysfs_init,
+ .sysfs_exit = pblk_sysfs_exit,
+};
+
+/* Module entry point: register the pblk target type (tt_pblk) */
+static int __init pblk_module_init(void)
+{
+ return nvm_register_tgt_type(&tt_pblk);
+}
+
+/*
+ * Module exit point: unregister the pblk target type. Marked __exit, to
+ * mirror the __init annotation on pblk_module_init(), so the function can be
+ * discarded when the driver is built in.
+ */
+static void __exit pblk_module_exit(void)
+{
+	nvm_unregister_tgt_type(&tt_pblk);
+}
+
+module_init(pblk_module_init);
+module_exit(pblk_module_exit);
+MODULE_AUTHOR("Javier Gonzalez <javier@cnexlabs.com>");
+MODULE_AUTHOR("Matias Bjorling <matias@cnexlabs.com>");
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("Physical Block-Device for Open-Channel SSDs");
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
new file mode 100644
index 0000000000000..17c16955284da
--- /dev/null
+++ b/drivers/lightnvm/pblk-map.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-map.c - pblk's lba-ppa mapping strategy
+ *
+ */
+
+#include "pblk.h"
+
+/*
+ * Map one write unit (pblk->min_write_pgs sectors) of the current data line.
+ *
+ * @sentry:     write buffer position of the first sector of this unit
+ * @ppa_list:   output, one device address per sector of the unit
+ * @lun_bitmap: passed through to pblk_down_rq()
+ * @meta_list:  output, per-sector metadata carrying the lba
+ * @valid_secs: number of leading sectors that carry data; the remaining
+ *              sectors of the unit are marked ADDR_EMPTY (padding)
+ *
+ * If the line becomes full it is replaced; when no replacement line is
+ * available the function returns without calling pblk_down_rq().
+ */
+static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
+ struct ppa_addr *ppa_list,
+ unsigned long *lun_bitmap,
+ struct pblk_sec_meta *meta_list,
+ unsigned int valid_secs)
+{
+ struct pblk_line *line = pblk_line_get_data(pblk);
+ struct line_emeta *emeta = line->emeta;
+ struct pblk_w_ctx *w_ctx;
+ __le64 *lba_list = pblk_line_emeta_to_lbas(emeta);
+ u64 paddr;
+ int nr_secs = pblk->min_write_pgs;
+ int i;
+
+ paddr = pblk_alloc_page(pblk, line, nr_secs);
+
+ for (i = 0; i < nr_secs; i++, paddr++) {
+ /* ppa to be sent to the device */
+ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
+
+ /* Write context for target bio completion on write buffer. Note
+ * that the write buffer is protected by the sync backpointer,
+ * and a single writer thread has access to each specific entry
+ * at a time. Thus, it is safe to modify the context for the
+ * entry we are setting up for submission without taking any
+ * lock or memory barrier.
+ */
+ if (i < valid_secs) {
+ /* One line reference per mapped data sector */
+ kref_get(&line->ref);
+ w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
+ w_ctx->ppa = ppa_list[i];
+ meta_list[i].lba = cpu_to_le64(w_ctx->lba);
+ lba_list[paddr] = cpu_to_le64(w_ctx->lba);
+ le64_add_cpu(&line->emeta->nr_valid_lbas, 1);
+ } else {
+ /* Padding sector: no lba is associated with it */
+ meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+ lba_list[paddr] = cpu_to_le64(ADDR_EMPTY);
+ pblk_map_pad_invalidate(pblk, line, paddr);
+ }
+ }
+
+ if (pblk_line_is_full(line)) {
+ line = pblk_line_replace_data(pblk);
+ if (!line)
+ return;
+ }
+
+ pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap);
+}
+
+/*
+ * Map the sectors of @rqd in units of pblk->min_write_pgs, starting at sector
+ * offset @off. @sentry is the write buffer position of the first sector of
+ * the request and @valid_secs the number of sectors that carry data.
+ *
+ * NOTE(review): for a unit that starts at or beyond @valid_secs the map_secs
+ * expression still yields valid_secs % min rather than 0 - presumably callers
+ * never pad more than the last unit; confirm against the callers.
+ */
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+ unsigned long *lun_bitmap, unsigned int valid_secs,
+ unsigned int off)
+{
+ struct pblk_sec_meta *meta_list = rqd->meta_list;
+ unsigned int map_secs;
+ int min = pblk->min_write_pgs;
+ int i;
+
+ for (i = off; i < rqd->nr_ppas; i += min) {
+ /* Full unit, or the partially valid unit that gets padded */
+ map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+ pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+ lun_bitmap, &meta_list[i], map_secs);
+ }
+}
+
+/*
+ * Map the sectors of @rqd like pblk_map_rq() and, along the way, pick one
+ * block of the next data line to erase.
+ *
+ * After each unit is mapped, the LUN it landed on is checked against the
+ * erase bitmap of the next line. For the first LUN that is not yet erased and
+ * for which the erase semaphore can be taken, *@erase_ppa is set to that LUN
+ * with the block id of the next line, and the remaining units are mapped with
+ * pblk_map_rq(). The erase semaphore is therefore only held on return when
+ * *@erase_ppa was set by the loop.
+ *
+ * If the loop selected nothing and *@erase_ppa is still empty, the first LUN
+ * missing from the erase bitmap (if any) is selected without taking the
+ * semaphore.
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+		       unsigned int sentry, unsigned long *lun_bitmap,
+		       unsigned int valid_secs, struct ppa_addr *erase_ppa)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line *e_line = pblk_line_get_data_next(pblk);
+	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	unsigned int map_secs;
+	int min = pblk->min_write_pgs;
+	int i, erase_lun;
+
+	for (i = 0; i < rqd->nr_ppas; i += min) {
+		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+		pblk_map_page_data(pblk, sentry + i, &rqd->ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs);
+
+		erase_lun = rqd->ppa_list[i].g.lun * geo->nr_chnls +
+							rqd->ppa_list[i].g.ch;
+
+		if (!test_bit(erase_lun, e_line->erase_bitmap)) {
+			if (down_trylock(&pblk->erase_sem))
+				continue;
+
+			set_bit(erase_lun, e_line->erase_bitmap);
+			atomic_dec(&e_line->left_eblks);
+			*erase_ppa = rqd->ppa_list[i];
+			erase_ppa->g.blk = e_line->id;
+
+			/* Avoid evaluating e_line->left_eblks: map the rest
+			 * of the request without looking for more erases
+			 */
+			pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
+							valid_secs, i + min);
+			return;
+		}
+	}
+
+	/* Erase blocks that are bad in this line but might not be in next */
+	if (unlikely(ppa_empty(*erase_ppa))) {
+		struct pblk_line_meta *lm = &pblk->lm;
+
+		i = find_first_zero_bit(e_line->erase_bitmap, lm->blk_per_line);
+		if (i == lm->blk_per_line)
+			return;
+
+		set_bit(i, e_line->erase_bitmap);
+		atomic_dec(&e_line->left_eblks);
+		*erase_ppa = pblk->luns[i].bppa; /* set ch and lun */
+		erase_ppa->g.blk = e_line->id;
+	}
+}
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
new file mode 100644
index 0000000000000..045384ddc1f90
--- /dev/null
+++ b/drivers/lightnvm/pblk-rb.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * Based upon the circular ringbuffer.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rb.c - pblk's write buffer
+ */
+
+#include <linux/circ_buf.h>
+
+#include "pblk.h"
+
+static DECLARE_RWSEM(pblk_rb_lock);
+
+/*
+ * Free every page set linked on rb->pages together with its tracking
+ * structure; the list is empty afterwards. Takes pblk_rb_lock for writing,
+ * so the caller must not already hold it.
+ */
+void pblk_rb_data_free(struct pblk_rb *rb)
+{
+ struct pblk_rb_pages *p, *t;
+
+ down_write(&pblk_rb_lock);
+ list_for_each_entry_safe(p, t, &rb->pages, list) {
+ free_pages((unsigned long)page_address(p->pages), p->order);
+ list_del(&p->list);
+ kfree(p);
+ }
+ up_write(&pblk_rb_lock);
+}
+
+/*
+ * Initialize ring buffer. The data and metadata buffers must be previously
+ * allocated and their size must be a power of two
+ * (Documentation/circular-buffers.txt)
+ *
+ * @rb_entry_base: array of 2^power_size entries used as ring metadata
+ * @power_size:    log2 of the number of entries
+ * @power_seg_sz:  log2 of the size in bytes of each entry's data segment
+ *
+ * The data area is allocated here in page sets of at most order
+ * MAX_ORDER - 1, and each entry is pointed at its segment inside a set.
+ *
+ * Returns 0 on success, -ENOMEM if a page set cannot be allocated; in that
+ * case all page sets allocated so far are freed again.
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+		 unsigned int power_size, unsigned int power_seg_sz)
+{
+	struct pblk *pblk = container_of(rb, struct pblk, rwb);
+	unsigned int init_entry = 0;
+	unsigned int alloc_order = power_size;
+	unsigned int max_order = MAX_ORDER - 1;
+	unsigned int order, iter;
+
+	down_write(&pblk_rb_lock);
+	rb->entries = rb_entry_base;
+	rb->seg_size = (1 << power_seg_sz);
+	rb->nr_entries = (1 << power_size);
+	rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
+	rb->sync_point = EMPTY_ENTRY;
+
+	spin_lock_init(&rb->w_lock);
+	spin_lock_init(&rb->s_lock);
+
+	INIT_LIST_HEAD(&rb->pages);
+
+	/* Split the buffer into several maximum-order sets if it is too large
+	 * for a single page allocation
+	 */
+	if (alloc_order >= max_order) {
+		order = max_order;
+		iter = (1 << (alloc_order - max_order));
+	} else {
+		order = alloc_order;
+		iter = 1;
+	}
+
+	do {
+		struct pblk_rb_entry *entry;
+		struct pblk_rb_pages *page_set;
+		void *kaddr;
+		unsigned long set_size;
+		int i;
+
+		page_set = kmalloc(sizeof(struct pblk_rb_pages), GFP_KERNEL);
+		if (!page_set)
+			goto fail_free_data;
+
+		page_set->order = order;
+		page_set->pages = alloc_pages(GFP_KERNEL, order);
+		if (!page_set->pages) {
+			kfree(page_set);
+			goto fail_free_data;
+		}
+		kaddr = page_address(page_set->pages);
+
+		/* Every entry of the set, including the first one, gets its
+		 * data segment, cacheline address, flags and bio list set up
+		 */
+		set_size = (1 << order);
+		for (i = 0; i < set_size; i++) {
+			entry = &rb->entries[init_entry];
+			entry->cacheline = pblk_cacheline_to_addr(init_entry++);
+			entry->data = kaddr + (i * rb->seg_size);
+			entry->w_ctx.flags = PBLK_WRITABLE_ENTRY;
+			bio_list_init(&entry->w_ctx.bios);
+		}
+
+		list_add_tail(&page_set->list, &rb->pages);
+		iter--;
+	} while (iter > 0);
+	up_write(&pblk_rb_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_set(&rb->inflight_sync_point, 0);
+#endif
+
+	/*
+	 * Initialize rate-limiter, which controls access to the write buffer
+	 * by user and GC I/O
+	 */
+	pblk_rl_init(&pblk->rl, rb->nr_entries);
+
+	return 0;
+
+fail_free_data:
+	/* pblk_rb_data_free() takes pblk_rb_lock itself and the rwsem is not
+	 * recursive, so the lock must be dropped before calling it
+	 */
+	up_write(&pblk_rb_lock);
+	pblk_rb_data_free(rb);
+	return -ENOMEM;
+}
+
+/*
+ * pblk_rb_calculate_size -- calculate the size of the write buffer
+ *
+ * The result is a power of two derived from get_count_order(@nr_entries),
+ * and never smaller than 128 entries.
+ */
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries)
+{
+	int order = get_count_order(nr_entries);
+
+	/* Alloc a write buffer that can at least fit 128 entries */
+	if (order < 7)
+		order = 7;
+
+	return (1 << order);
+}
+
+/* Return the entry array that was handed to pblk_rb_init() */
+void *pblk_rb_entries_ref(struct pblk_rb *rb)
+{
+ return rb->entries;
+}
+
+/*
+ * Hand a write context back to the buffer: spin until the entry carries
+ * PBLK_SUBMITTED_ENTRY, then mark it writable again and clear its ppa.
+ */
+static void clean_wctx(struct pblk_w_ctx *w_ctx)
+{
+	int flags;
+
+	/* Busy-wait until the entry has been flagged as submitted */
+	do {
+		flags = READ_ONCE(w_ctx->flags);
+	} while (!(flags & PBLK_SUBMITTED_ENTRY));
+
+	/* Release flags on context. Protect from writes and reads */
+	smp_store_release(&w_ctx->flags, PBLK_WRITABLE_ENTRY);
+	pblk_ppa_set_empty(&w_ctx->ppa);
+}
+
+/*
+ * Thin wrappers over the CIRC_CNT()/CIRC_SPACE() helpers from
+ * <linux/circ_buf.h>. The rb argument of pblk_rb_ring_space() is not used.
+ */
+#define pblk_rb_ring_count(head, tail, size) CIRC_CNT(head, tail, size)
+#define pblk_rb_ring_space(rb, head, tail, size) \
+ (CIRC_SPACE(head, tail, size))
+
+/*
+ * Free space in the buffer, measured from the write pointer (mem) up to the
+ * back pointer that signals entries already synchronized to the media (sync).
+ */
+static unsigned int pblk_rb_space(struct pblk_rb *rb)
+{
+	unsigned int head = READ_ONCE(rb->mem);
+	unsigned int tail = READ_ONCE(rb->sync);
+	unsigned int free_entries;
+
+	free_entries = pblk_rb_ring_space(rb, head, tail, rb->nr_entries);
+
+	return free_entries;
+}
+
+/*
+ * Number of entries that are available to send to the media, i.e. the
+ * entries between the submission pointer (subm) and the write pointer (mem).
+ */
+unsigned int pblk_rb_read_count(struct pblk_rb *rb)
+{
+	unsigned int head = READ_ONCE(rb->mem);
+	unsigned int tail = READ_ONCE(rb->subm);
+	unsigned int pending;
+
+	pending = pblk_rb_ring_count(head, tail, rb->nr_entries);
+
+	return pending;
+}
+
+/*
+ * Advance the submission pointer by @nr_entries (modulo the buffer size).
+ * Returns the position of the submission pointer before the update.
+ */
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries)
+{
+	unsigned int prev = READ_ONCE(rb->subm);
+	unsigned int next = (prev + nr_entries) & (rb->nr_entries - 1);
+
+	/* Commit read means updating submission pointer */
+	smp_store_release(&rb->subm, next);
+
+	return prev;
+}
+
+/*
+ * Update the L2P table for @to_update entries starting at position *@l2p_upd.
+ *
+ * For each entry the mapping of its lba is updated with the ppa stored in
+ * the write context, the reference on the line that ppa belongs to is
+ * dropped, and the context is reset through clean_wctx(), which spins until
+ * the entry is flagged PBLK_SUBMITTED_ENTRY. *@l2p_upd is advanced modulo the
+ * buffer size.
+ *
+ * Always returns 0.
+ */
+static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd,
+ unsigned int to_update)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_line *line;
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ unsigned int i;
+
+ for (i = 0; i < to_update; i++) {
+ entry = &rb->entries[*l2p_upd];
+ w_ctx = &entry->w_ctx;
+
+ pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
+ entry->cacheline);
+
+ /* The line must be looked up before clean_wctx() clears the ppa */
+ line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
+ kref_put(&line->ref, pblk_line_put);
+ clean_wctx(w_ctx);
+ *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1);
+ }
+
+ return 0;
+}
+
+/*
+ * When we move the l2p_update pointer, we update the l2p table - lookups will
+ * point to the physical address instead of to the cacheline in the write buffer
+ * from this moment on.
+ *
+ * Only the entries that an incoming write of @nr_entries is about to
+ * overwrite are updated. Must be called with rb->w_lock held.
+ */
+static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries,
+			      unsigned int mem, unsigned int sync)
+{
+	unsigned int untouched;
+
+	lockdep_assert_held(&rb->w_lock);
+
+	/* Update l2p only as buffer entries are being overwritten */
+	untouched = pblk_rb_ring_space(rb, mem, rb->l2p_update, rb->nr_entries);
+	if (untouched > nr_entries)
+		return 0;
+
+	/* l2p_update used exclusively under rb->w_lock */
+	return __pblk_rb_update_l2p(rb, &rb->l2p_update,
+						nr_entries - untouched);
+}
+
+/*
+ * Update the l2p entry for all sectors stored on the write buffer. This means
+ * that all future lookups to the l2p table will point to a device address, not
+ * to the cacheline in the write buffer.
+ *
+ * Takes rb->w_lock for the duration of the update.
+ */
+void pblk_rb_sync_l2p(struct pblk_rb *rb)
+{
+ unsigned int sync;
+ unsigned int to_update;
+
+ spin_lock(&rb->w_lock);
+
+ /* Protect from reads and writes */
+ sync = smp_load_acquire(&rb->sync);
+
+ /* Entries between l2p_update and sync still await their l2p update */
+ to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries);
+ __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update);
+
+ spin_unlock(&rb->w_lock);
+}
+
+/*
+ * Fill a single ring buffer entry: copy one segment (rb->seg_size bytes) from
+ * @data into the entry's data area and record the lba and ppa of @w_ctx in
+ * the entry's write context. The entry flags are not touched here; the
+ * callers publish them afterwards with a release store.
+ */
+static void __pblk_rb_write_entry(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx,
+ struct pblk_rb_entry *entry)
+{
+ memcpy(entry->data, data, rb->seg_size);
+
+ entry->w_ctx.lba = w_ctx.lba;
+ entry->w_ctx.ppa = w_ctx.ppa;
+}
+
+/*
+ * Store one segment of user data at @ring_pos and point the mapping of
+ * w_ctx.lba at the entry's cacheline. The entry is published last by
+ * releasing its flags with PBLK_WRITTEN_DATA set.
+ *
+ * The caller must guarantee that the entry at @ring_pos is free; this is
+ * only enforced (BUG_ON) under CONFIG_NVM_DEBUG.
+ */
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, unsigned int ring_pos)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ int flags;
+
+ entry = &rb->entries[ring_pos];
+ flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_DEBUG
+ /* Caller must guarantee that the entry is free */
+ BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+ __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+ pblk_update_map_cache(pblk, w_ctx.lba, entry->cacheline);
+ flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+}
+
+/*
+ * GC counterpart of pblk_rb_write_entry_user(): store one segment of data
+ * read from @gc_line at @ring_pos. When pblk_update_map_gc() returns 0 the
+ * entry's lba is set to ADDR_EMPTY. The entry is published last by releasing
+ * its flags with PBLK_WRITTEN_DATA set.
+ *
+ * The caller must guarantee that the entry at @ring_pos is free; this is
+ * only enforced (BUG_ON) under CONFIG_NVM_DEBUG.
+ */
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
+ unsigned int ring_pos)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ int flags;
+
+ entry = &rb->entries[ring_pos];
+ flags = READ_ONCE(entry->w_ctx.flags);
+#ifdef CONFIG_NVM_DEBUG
+ /* Caller must guarantee that the entry is free */
+ BUG_ON(!(flags & PBLK_WRITABLE_ENTRY));
+#endif
+
+ __pblk_rb_write_entry(rb, data, w_ctx, entry);
+
+ if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line))
+ entry->w_ctx.lba = ADDR_EMPTY;
+
+ flags = w_ctx.flags | PBLK_WRITTEN_DATA;
+
+ /* Release flags on write context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+}
+
+/*
+ * Set a sync (flush) point on the entry that precedes @pos and queue @bio on
+ * that entry's bio list.
+ *
+ * Returns 0 if @pos equals the submission pointer (no sync point is set and
+ * @bio is not queued), 1 if the sync point was set.
+ *
+ * The debug counter of in-flight sync points is only incremented when a sync
+ * point is really set, so that it stays balanced with the decrement done when
+ * the flagged entry is read out of the buffer.
+ */
+static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
+				  unsigned int pos)
+{
+	struct pblk_rb_entry *entry;
+	unsigned int subm, sync_point;
+	int flags;
+
+	subm = READ_ONCE(rb->subm);
+
+	if (pos == subm)
+		return 0;
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_inc(&rb->inflight_sync_point);
+#endif
+
+	/* The sync point is the last entry written, i.e. the one before pos */
+	sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
+	entry = &rb->entries[sync_point];
+
+	flags = READ_ONCE(entry->w_ctx.flags);
+	flags |= PBLK_FLUSH_ENTRY;
+
+	/* Release flags on context. Protect from writes */
+	smp_store_release(&entry->w_ctx.flags, flags);
+
+	/* Protect syncs */
+	smp_store_release(&rb->sync_point, sync_point);
+
+	spin_lock_irq(&rb->s_lock);
+	bio_list_add(&entry->w_ctx.bios, bio);
+	spin_unlock_irq(&rb->s_lock);
+
+	return 1;
+}
+
+/*
+ * Check whether @nr_entries fit in the buffer and, if so, update the l2p
+ * table for the entries that are about to be overwritten.
+ *
+ * On success *@pos is set to the current write pointer and 1 is returned;
+ * the write pointer itself is not advanced here. Returns 0 if there is not
+ * enough space or the l2p update fails. Calls pblk_rb_update_l2p(), which
+ * asserts that rb->w_lock is held.
+ */
+static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos)
+{
+ unsigned int mem;
+ unsigned int sync;
+
+ sync = READ_ONCE(rb->sync);
+ mem = READ_ONCE(rb->mem);
+
+ if (pblk_rb_ring_space(rb, mem, sync, rb->nr_entries) < nr_entries)
+ return 0;
+
+ if (pblk_rb_update_l2p(rb, nr_entries, mem, sync))
+ return 0;
+
+ *pos = mem;
+
+ return 1;
+}
+
+/*
+ * Reserve @nr_entries in the buffer. On success *@pos holds the position of
+ * the first reserved entry, the write pointer is advanced past the
+ * reservation and 1 is returned; otherwise 0 is returned.
+ */
+static int pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
+			     unsigned int *pos)
+{
+	unsigned int new_mem;
+
+	if (!__pblk_rb_may_write(rb, nr_entries, pos))
+		return 0;
+
+	new_mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+
+	/* Protect from read count */
+	smp_store_release(&rb->mem, new_mem);
+	return 1;
+}
+
+/*
+ * Like pblk_rb_may_write(), but additionally handles REQ_PREFLUSH bios: a
+ * sync point is set at the end of the reservation and the bio is attached to
+ * it.
+ *
+ * *@io_ret is set to NVM_IO_OK when the bio was attached to a sync point and
+ * to NVM_IO_DONE otherwise. Returns 1 if the entries were reserved, 0 if not
+ * (in which case *@io_ret is left untouched).
+ */
+static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos, struct bio *bio,
+ int *io_ret)
+{
+ unsigned int mem;
+
+ if (!__pblk_rb_may_write(rb, nr_entries, pos))
+ return 0;
+
+ mem = (*pos + nr_entries) & (rb->nr_entries - 1);
+ *io_ret = NVM_IO_DONE;
+
+ if (bio->bi_opf & REQ_PREFLUSH) {
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_inc(&pblk->nr_flush);
+#endif
+ if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem))
+ *io_ret = NVM_IO_OK;
+ }
+
+ /* Protect from read count */
+ smp_store_release(&rb->mem, mem);
+ return 1;
+}
+
+/*
+ * Atomically check that (i) there is space on the write buffer for the
+ * incoming I/O, and (ii) the current I/O type has enough budget in the write
+ * buffer (rate-limiter).
+ *
+ * Returns NVM_IO_REQUEUE if the user I/O cannot be admitted right now;
+ * otherwise the entries are reserved starting at *@pos and the value produced
+ * by the flush handling (NVM_IO_OK or NVM_IO_DONE) is returned.
+ */
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+			   unsigned int nr_entries, unsigned int *pos)
+{
+	struct pblk *pblk = container_of(rb, struct pblk, rwb);
+	int io_ret = NVM_IO_REQUEUE;
+	int flush_done;
+
+	spin_lock(&rb->w_lock);
+
+	if (!pblk_rl_user_may_insert(&pblk->rl, nr_entries))
+		goto out_unlock;
+
+	if (!pblk_rb_may_write_flush(rb, nr_entries, pos, bio, &flush_done))
+		goto out_unlock;
+
+	pblk_rl_user_in(&pblk->rl, nr_entries);
+	io_ret = flush_done;
+
+out_unlock:
+	spin_unlock(&rb->w_lock);
+	return io_ret;
+}
+
+/*
+ * GC variant of the admission check: under rb->w_lock, verify that the
+ * rate-limiter grants GC I/O budget for @nr_entries and that the entries fit
+ * in the write buffer.
+ *
+ * Returns 1 if the entries were reserved starting at *@pos, 0 otherwise.
+ */
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+			 unsigned int *pos)
+{
+	struct pblk *pblk = container_of(rb, struct pblk, rwb);
+	int admitted = 0;
+
+	spin_lock(&rb->w_lock);
+
+	if (!pblk_rl_gc_may_insert(&pblk->rl, nr_entries))
+		goto out_unlock;
+
+	if (!pblk_rb_may_write(rb, nr_entries, pos))
+		goto out_unlock;
+
+	pblk_rl_gc_in(&pblk->rl, nr_entries);
+	admitted = 1;
+
+out_unlock:
+	spin_unlock(&rb->w_lock);
+	return admitted;
+}
+
+/*
+ * Add the data pages of the entries on @list to @bio and unlink each entry
+ * that was added. At most @max entries are added; finding more entries than
+ * that on the list is reported as an error.
+ *
+ * Returns the number of entries added to @bio. Entries that were not added
+ * stay on @list.
+ *
+ * The caller of this function must ensure that the backpointer will not
+ * overwrite the entries passed on the list.
+ */
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+				      struct list_head *list,
+				      unsigned int max)
+{
+	struct pblk_rb_entry *entry, *tentry;
+	struct page *page;
+	unsigned int read = 0;
+	int ret;
+
+	list_for_each_entry_safe(entry, tentry, list, index) {
+		/* 'read' entries are already in the bio: stop at the limit */
+		if (read >= max) {
+			pr_err("pblk: too many entries on list\n");
+			goto out;
+		}
+
+		page = virt_to_page(entry->data);
+		if (!page) {
+			pr_err("pblk: could not allocate write bio page\n");
+			goto out;
+		}
+
+		ret = bio_add_page(bio, page, rb->seg_size, 0);
+		if (ret != rb->seg_size) {
+			pr_err("pblk: could not add page to write bio\n");
+			goto out;
+		}
+
+		list_del(&entry->index);
+		read++;
+	}
+
+out:
+	return read;
+}
+
+/*
+ * Read available entries on rb and add them to the given bio. To avoid a memory
+ * copy, a page reference to the write buffer is used to be added to the bio.
+ *
+ * This function is used by the write thread to form the write bio that will
+ * persist data on the write buffer to the media.
+ *
+ * @c_ctx:      filled with the start position and the number of valid and
+ *              padded entries of this read
+ * @pos:        buffer position of the first entry to read
+ * @nr_entries: number of entries the request is sized for
+ * @count:      number of entries actually available; when smaller than
+ *              @nr_entries the difference is accounted as padding
+ *
+ * Returns the number of entries read, or 0 if a page could not be added to
+ * the bio (entries processed before the failure stay flagged as submitted).
+ */
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+ struct pblk_c_ctx *c_ctx,
+ unsigned int pos,
+ unsigned int nr_entries,
+ unsigned int count)
+{
+ struct pblk *pblk = container_of(rb, struct pblk, rwb);
+ struct pblk_rb_entry *entry;
+ struct page *page;
+ unsigned int pad = 0, read = 0, to_read = nr_entries;
+ unsigned int user_io = 0, gc_io = 0;
+ unsigned int i;
+ int flags;
+ int ret;
+
+ if (count < nr_entries) {
+ pad = nr_entries - count;
+ to_read = count;
+ }
+
+ c_ctx->sentry = pos;
+ c_ctx->nr_valid = to_read;
+ c_ctx->nr_padded = pad;
+
+ for (i = 0; i < to_read; i++) {
+ entry = &rb->entries[pos];
+
+ /* A write has been allowed into the buffer, but data is still
+ * being copied to it. It is ok to busy wait.
+ */
+try:
+ flags = READ_ONCE(entry->w_ctx.flags);
+ if (!(flags & PBLK_WRITTEN_DATA))
+ goto try;
+
+ /* Account the entry to the I/O type that produced it */
+ if (flags & PBLK_IOTYPE_USER)
+ user_io++;
+ else if (flags & PBLK_IOTYPE_GC)
+ gc_io++;
+ else
+ WARN(1, "pblk: unknown IO type\n");
+
+ page = virt_to_page(entry->data);
+ if (!page) {
+ pr_err("pblk: could not allocate write bio page\n");
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+ goto out;
+ }
+
+ ret = bio_add_page(bio, page, rb->seg_size, 0);
+ if (ret != rb->seg_size) {
+ pr_err("pblk: could not add page to write bio\n");
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+ goto out;
+ }
+
+ if (flags & PBLK_FLUSH_ENTRY) {
+ unsigned int sync_point;
+
+ /* Clear the global sync point only if it is this entry */
+ sync_point = READ_ONCE(rb->sync_point);
+ if (sync_point == pos) {
+ /* Protect syncs */
+ smp_store_release(&rb->sync_point, EMPTY_ENTRY);
+ }
+
+ flags &= ~PBLK_FLUSH_ENTRY;
+#ifdef CONFIG_NVM_DEBUG
+ atomic_dec(&rb->inflight_sync_point);
+#endif
+ }
+
+ flags &= ~PBLK_WRITTEN_DATA;
+ flags |= PBLK_SUBMITTED_ENTRY;
+
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&entry->w_ctx.flags, flags);
+
+ pos = (pos + 1) & (rb->nr_entries - 1);
+ }
+
+ read = to_read;
+ pblk_rl_out(&pblk->rl, user_io, gc_io);
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(pad, &((struct pblk *)
+ (container_of(rb, struct pblk, rwb)))->padded_writes);
+#endif
+out:
+ return read;
+}
+
+/*
+ * Copy to bio only if the lba matches the one on the given cache entry.
+ * Otherwise, it means that the entry has been overwritten, and the bio should
+ * be directed to disk.
+ *
+ * @lba:      logical address the reader is looking for
+ * @pos:      buffer position (cacheline) the lookup pointed at
+ * @bio_iter: index, in PBLK_EXPOSED_PAGE_SIZE units, of the bio segment to
+ *            fill
+ *
+ * Returns 1 if one segment was copied into the bio, 0 if the entry no longer
+ * holds @lba. Runs under rb->w_lock.
+ */
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+ u64 pos, int bio_iter)
+{
+ struct pblk_rb_entry *entry;
+ struct pblk_w_ctx *w_ctx;
+ void *data;
+ int flags;
+ int ret = 1;
+
+ spin_lock(&rb->w_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+ /* Caller must ensure that the access will not cause an overflow */
+ BUG_ON(pos >= rb->nr_entries);
+#endif
+ entry = &rb->entries[pos];
+ w_ctx = &entry->w_ctx;
+ flags = READ_ONCE(w_ctx->flags);
+
+ /* Check if the entry has been overwritten or is scheduled to be */
+ if (w_ctx->lba != lba || flags & PBLK_WRITABLE_ENTRY) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Only advance the bio if it hasn't been advanced already. If advanced,
+ * this bio is at least a partial bio (i.e., it has partially been
+ * filled with data from the cache). If part of the data resides on the
+ * media, we will read later on
+ */
+ if (unlikely(!bio->bi_iter.bi_idx))
+ bio_advance(bio, bio_iter * PBLK_EXPOSED_PAGE_SIZE);
+
+ data = bio_data(bio);
+ memcpy(data, entry->data, rb->seg_size);
+
+out:
+ spin_unlock(&rb->w_lock);
+ return ret;
+}
+
+/* Return the write context of the ring entry that @pos wraps to. */
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos)
+{
+	unsigned int mask = rb->nr_entries - 1;
+	struct pblk_rb_entry *slot = &rb->entries[pos & mask];
+
+	return &slot->w_ctx;
+}
+
+/*
+ * Take the sync lock and return the current sync pointer. If @flags is given,
+ * the interrupt state is saved in it (irqsave variant); if it is NULL, the
+ * plain irq-disabling variant of the lock is used.
+ */
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags)
+	__acquires(&rb->s_lock)
+{
+	if (!flags)
+		spin_lock_irq(&rb->s_lock);
+	else
+		spin_lock_irqsave(&rb->s_lock, *flags);
+
+	return rb->sync;
+}
+
+/*
+ * Release the sync lock. @flags selects the unlock variant: irqrestore when
+ * it is non-NULL, plain irq-enabling unlock when it is NULL.
+ */
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
+	__releases(&rb->s_lock)
+{
+	lockdep_assert_held(&rb->s_lock);
+
+	if (!flags)
+		spin_unlock_irq(&rb->s_lock);
+	else
+		spin_unlock_irqrestore(&rb->s_lock, *flags);
+}
+
+/*
+ * Move the sync pointer @nr_entries positions forward, wrapping around the
+ * ring, and publish the new value. Must be called with the sync lock held.
+ *
+ * Returns the new sync pointer.
+ */
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
+{
+	unsigned int pos;
+
+	lockdep_assert_held(&rb->s_lock);
+
+	pos = READ_ONCE(rb->sync);
+	while (nr_entries--)
+		pos = (pos + 1) & (rb->nr_entries - 1);
+
+	/* Protect from counts */
+	smp_store_release(&rb->sync, pos);
+
+	return pos;
+}
+
+/*
+ * Return the ring count between the submission pointer and the current sync
+ * point, counting the sync point itself, or 0 if no sync point is set.
+ */
+unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb)
+{
+	unsigned int sync_point;
+	unsigned int subm;
+
+	/* Protect syncs */
+	sync_point = smp_load_acquire(&rb->sync_point);
+	if (sync_point == EMPTY_ENTRY)
+		return 0;
+
+	subm = READ_ONCE(rb->subm);
+
+	/* The sync point itself counts as a sector to sync */
+	return pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1;
+}
+
+/*
+ * Scan from the current position of the sync pointer to find the entry that
+ * corresponds to the given ppa. This is necessary since write requests can be
+ * completed out of order. The assumption is that the ppa is close to the sync
+ * pointer thus the search will not take long.
+ *
+ * The caller of this function must guarantee that the sync pointer will not
+ * reach the entry while it is using the metadata associated with it. With this
+ * assumption in mind, there is no need to take the sync lock.
+ *
+ * NOTE(review): as written, the loop below only advances a local copy of the
+ * sync pointer by the count returned from pblk_rb_ring_count(); @ppa is never
+ * compared against any entry and the function always returns NULL.
+ */
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+ struct ppa_addr *ppa)
+{
+ unsigned int sync, subm, count;
+ unsigned int i;
+
+ sync = READ_ONCE(rb->sync);
+ subm = READ_ONCE(rb->subm);
+ count = pblk_rb_ring_count(subm, sync, rb->nr_entries);
+
+ for (i = 0; i < count; i++)
+ sync = (sync + 1) & (rb->nr_entries - 1); /* local copy only; rb->sync is not modified */
+
+ return NULL; /* no entry is ever matched, see the note above */
+}
+
+/*
+ * Consistency check on the ring buffer, taken under both the write and the
+ * sync lock.
+ *
+ * Returns 0 if all ring pointers coincide and no sync point is pending. If
+ * they do not, returns 1 when the entry array or the data pointer of any
+ * entry is missing, and 0 otherwise.
+ */
+int pblk_rb_tear_down_check(struct pblk_rb *rb)
+{
+	int drained;
+	int ret = 0;
+	int i;
+
+	spin_lock(&rb->w_lock);
+	spin_lock_irq(&rb->s_lock);
+
+	drained = (rb->mem == rb->subm) && (rb->subm == rb->sync) &&
+		  (rb->sync == rb->l2p_update) &&
+		  (rb->sync_point == EMPTY_ENTRY);
+
+	if (!drained) {
+		if (!rb->entries) {
+			ret = 1;
+		} else {
+			for (i = 0; i < rb->nr_entries; i++) {
+				if (!rb->entries[i].data) {
+					ret = 1;
+					break;
+				}
+			}
+		}
+	}
+
+	spin_unlock(&rb->w_lock);
+	spin_unlock_irq(&rb->s_lock);
+
+	return ret;
+}
+
+/* Wrap @pos into the ring by masking it with (nr_entries - 1). */
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos)
+{
+	unsigned int mask = rb->nr_entries - 1;
+
+	return pos & mask;
+}
+
+/* Return non-zero if @pos lies beyond the last entry of the ring buffer. */
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos)
+{
+	if (pos >= rb->nr_entries)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Format a one-line snapshot of the ring buffer state into @buf: size, the
+ * mem/subm/sync/l2p_update pointers, in-flight sync points (debug builds
+ * only, 0 otherwise), the sync point (or "NULL" when none is set), the
+ * read/space/sync-point counts and the length of the completion list.
+ *
+ * Returns the number of characters written.
+ */
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
+{
+	struct pblk *pblk = container_of(rb, struct pblk, rwb);
+	struct pblk_c_ctx *c;
+	int queued_entries = 0;
+	int inflight = 0;
+
+	/* The completion list is walked under the sync lock */
+	spin_lock_irq(&rb->s_lock);
+	list_for_each_entry(c, &pblk->compl_list, list)
+		queued_entries++;
+	spin_unlock_irq(&rb->s_lock);
+
+#ifdef CONFIG_NVM_DEBUG
+	inflight = atomic_read(&rb->inflight_sync_point);
+#endif
+
+	if (rb->sync_point == EMPTY_ENTRY)
+		return scnprintf(buf, PAGE_SIZE,
+			"%u\t%u\t%u\t%u\t%u\t%u\tNULL - %u/%u/%u - %d\n",
+			rb->nr_entries,
+			rb->mem,
+			rb->subm,
+			rb->sync,
+			rb->l2p_update,
+			inflight,
+			pblk_rb_read_count(rb),
+			pblk_rb_space(rb),
+			pblk_rb_sync_point_count(rb),
+			queued_entries);
+
+	return scnprintf(buf, PAGE_SIZE,
+		"%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
+		rb->nr_entries,
+		rb->mem,
+		rb->subm,
+		rb->sync,
+		rb->l2p_update,
+		inflight,
+		rb->sync_point,
+		pblk_rb_read_count(rb),
+		pblk_rb_space(rb),
+		pblk_rb_sync_point_count(rb),
+		queued_entries);
+}
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
new file mode 100644
index 0000000000000..4a12f14d78c68
--- /dev/null
+++ b/drivers/lightnvm/pblk-read.c
@@ -0,0 +1,529 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-read.c - pblk's read path
+ */
+
+#include "pblk.h"
+
+/*
+ * Read one sector from the write buffer.
+ *
+ * The value read from the cache may already have been updated and reside at
+ * another location in the cache. What is guaranteed is that, if the value is
+ * read from the cache, it belongs to the mapped lba. To guarantee ordering
+ * between writes and reads, a flush must be issued.
+ */
+static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio,
+				sector_t lba, struct ppa_addr ppa,
+				int bio_iter)
+{
+	u64 cacheline;
+
+#ifdef CONFIG_NVM_DEBUG
+	/* Callers must ensure that the ppa points to a cache address */
+	BUG_ON(pblk_ppa_empty(ppa));
+	BUG_ON(!pblk_addr_in_cache(ppa));
+#endif
+
+	cacheline = pblk_addr_to_cacheline(ppa);
+
+	return pblk_rb_copy_to_bio(&pblk->rwb, bio, lba, cacheline, bio_iter);
+}
+
+/*
+ * Resolve every sector of a multi-sector read.
+ *
+ * For each sector the L2P entry is looked up. Unmapped sectors and sectors
+ * that could be copied from the write buffer get their bit set in
+ * @read_bitmap; all remaining sectors are appended to rqd->ppa_list so that
+ * they can be read from the media.
+ *
+ * NOTE(review): on the out-of-bounds bail-out neither @read_bitmap nor
+ * rqd->ppa_list is populated, and the caller has no return value to detect
+ * it.
+ */
+static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
+				 unsigned long *read_bitmap)
+{
+	struct bio *bio = rqd->bio;
+	struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+	sector_t blba = pblk_get_lba(bio);
+	int nr_secs = rqd->nr_ppas;
+	int advanced_bio = 0;
+	int i, j = 0;
+
+	/*
+	 * logic error: lba out-of-bounds. Ignore read request.
+	 *
+	 * The request covers lbas [blba, blba + nr_secs), so it is in bounds
+	 * as long as blba + nr_secs does not exceed the number of sectors; a
+	 * request ending exactly on the last sector is valid.
+	 */
+	if (blba + nr_secs > pblk->rl.nr_secs) {
+		WARN(1, "pblk: read lbas out of bounds\n");
+		return;
+	}
+
+	pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs);
+
+	for (i = 0; i < nr_secs; i++) {
+		struct ppa_addr p = ppas[i];
+		sector_t lba = blba + i;
+
+retry:
+		/* Unmapped sector: nothing to read, mark it as done */
+		if (pblk_ppa_empty(p)) {
+			WARN_ON(test_and_set_bit(i, read_bitmap));
+			continue;
+		}
+
+		/* Try to read from write buffer. The address is later checked
+		 * on the write buffer to prevent retrieving overwritten data.
+		 */
+		if (pblk_addr_in_cache(p)) {
+			if (!pblk_read_from_cache(pblk, bio, lba, p, i)) {
+				/* Entry changed under us: look it up again */
+				pblk_lookup_l2p_seq(pblk, &p, lba, 1);
+				goto retry;
+			}
+			WARN_ON(test_and_set_bit(i, read_bitmap));
+			advanced_bio = 1;
+		} else {
+			/* Read from media non-cached sectors */
+			rqd->ppa_list[j++] = p;
+		}
+
+		/* Once cache data has been copied, keep the bio in step */
+		if (advanced_bio)
+			bio_advance(bio, PBLK_EXPOSED_PAGE_SIZE);
+	}
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_add(nr_secs, &pblk->inflight_reads);
+#endif
+}
+
+/*
+ * Set the read mode flags on @rqd and submit it. Any submission error is
+ * reported as NVM_IO_ERR; success is NVM_IO_OK.
+ */
+static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	rqd->flags = pblk_set_read_mode(pblk);
+
+	if (pblk_submit_io(pblk, rqd))
+		return NVM_IO_ERR;
+
+	return NVM_IO_OK;
+}
+
+/*
+ * Completion handler for read requests: logs device errors, frees the ppa
+ * list of multi-sector requests, drops the reference on the request's bio,
+ * completes and releases the original bio if one was recorded in the read
+ * context, and finally frees the request.
+ */
+static void pblk_end_io_read(struct nvm_rq *rqd)
+{
+	struct pblk *pblk = rqd->private;
+	struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+	struct bio *bio = rqd->bio;
+	struct bio *orig_bio;
+
+	if (rqd->error)
+		pblk_log_read_err(pblk, rqd);
+#ifdef CONFIG_NVM_DEBUG
+	else
+		WARN_ONCE(bio->bi_error, "pblk: corrupted read error\n");
+#endif
+
+	/* The ppa list is only released for requests with several ppas */
+	if (rqd->nr_ppas > 1)
+		nvm_dev_dma_free(pblk->dev->parent, rqd->ppa_list,
+				 rqd->dma_ppa_list);
+
+	bio_put(bio);
+
+	orig_bio = r_ctx->orig_bio;
+	if (orig_bio) {
+#ifdef CONFIG_NVM_DEBUG
+		WARN_ONCE(orig_bio->bi_error,
+						"pblk: corrupted read bio\n");
+#endif
+		bio_endio(orig_bio);
+		bio_put(orig_bio);
+	}
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_add(rqd->nr_ppas, &pblk->sync_reads);
+	atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads);
+#endif
+
+	pblk_free_rqd(pblk, rqd, READ);
+}
+
+/*
+ * Complete a read that was only partially served from the write buffer.
+ *
+ * The sectors whose bit is clear in @read_bitmap ("holes") are read
+ * synchronously from the device into a temporary bio backed by pages from
+ * pblk->page_pool, and then copied into the matching segments of the original
+ * bio, counted from @bio_init_idx. On success the original bio is ended and
+ * @rqd is completed through pblk_end_io_read(). On failure @rqd is restored
+ * to its original state and released through pblk_end_io_read() as well,
+ * without ending the original bio.
+ *
+ * Returns NVM_IO_OK on success, NVM_IO_ERR otherwise.
+ */
+static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
+				      unsigned int bio_init_idx,
+				      unsigned long *read_bitmap)
+{
+	struct bio *new_bio, *bio = rqd->bio;
+	struct bio_vec src_bv, dst_bv;
+	void *ppa_ptr = NULL;
+	void *src_p, *dst_p;
+	dma_addr_t dma_ppa_list = 0;
+	int nr_secs = rqd->nr_ppas;
+	int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
+	int i, ret, hole;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	new_bio = bio_alloc(GFP_KERNEL, nr_holes);
+	if (!new_bio) {
+		pr_err("pblk: could not alloc read bio\n");
+		goto err_end_rqd;
+	}
+
+	if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
+		goto err_free_pages;
+
+	if (nr_holes != new_bio->bi_vcnt) {
+		pr_err("pblk: malformed bio\n");
+		goto err_free_pages;
+	}
+
+	new_bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
+	new_bio->bi_private = &wait;
+	new_bio->bi_end_io = pblk_end_bio_sync;
+
+	/* Borrow the request for the internal read of the holes */
+	rqd->bio = new_bio;
+	rqd->nr_ppas = nr_holes;
+	rqd->end_io = NULL;
+
+	/* A single hole is addressed through ppa_addr; keep the list around */
+	if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+		ppa_ptr = rqd->ppa_list;
+		dma_ppa_list = rqd->dma_ppa_list;
+		rqd->ppa_addr = rqd->ppa_list[0];
+	}
+
+	ret = pblk_submit_read_io(pblk, rqd);
+	if (ret) {
+		pr_err("pblk: read IO submission failed\n");
+		goto err_free_pages;
+	}
+
+	/* NOTE(review): a timeout is only logged; processing carries on */
+	if (!wait_for_completion_io_timeout(&wait,
+				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+		pr_err("pblk: partial read I/O timed out\n");
+	}
+
+	if (rqd->error) {
+		atomic_long_inc(&pblk->read_failed);
+#ifdef CONFIG_NVM_DEBUG
+		pblk_print_failed_rqd(pblk, rqd, rqd->error);
+#endif
+	}
+
+	if (unlikely(nr_secs > 1 && nr_holes == 1)) {
+		rqd->ppa_list = ppa_ptr;
+		rqd->dma_ppa_list = dma_ppa_list;
+	}
+
+	/* Fill the holes in the original bio */
+	i = 0;
+	hole = find_first_zero_bit(read_bitmap, nr_secs);
+	do {
+		src_bv = new_bio->bi_io_vec[i++];
+		dst_bv = bio->bi_io_vec[bio_init_idx + hole];
+
+		src_p = kmap_atomic(src_bv.bv_page);
+		dst_p = kmap_atomic(dst_bv.bv_page);
+
+		memcpy(dst_p + dst_bv.bv_offset,
+			src_p + src_bv.bv_offset,
+			PBLK_EXPOSED_PAGE_SIZE);
+
+		kunmap_atomic(src_p);
+		kunmap_atomic(dst_p);
+
+		mempool_free(src_bv.bv_page, pblk->page_pool);
+
+		hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1);
+	} while (hole < nr_secs);
+
+	bio_put(new_bio);
+
+	/* Complete the original bio and associated request */
+	rqd->bio = bio;
+	rqd->nr_ppas = nr_secs;
+	rqd->private = pblk;
+
+	bio_endio(bio);
+	pblk_end_io_read(rqd);
+	return NVM_IO_OK;
+
+err_free_pages:
+	/*
+	 * Free allocated pages in new bio. The pages belong to new_bio, not
+	 * to the caller's bio, and new_bio must still be alive while its
+	 * vector count is read; the single reference is dropped afterwards.
+	 */
+	pblk_bio_free_pages(pblk, new_bio, 0, new_bio->bi_vcnt);
+	bio_put(new_bio);
+err_end_rqd:
+	/*
+	 * Hand the request back in its original shape so that
+	 * pblk_end_io_read() releases the ppa list and the reference on the
+	 * original bio rather than touching the internal one.
+	 */
+	if (ppa_ptr) {
+		rqd->ppa_list = ppa_ptr;
+		rqd->dma_ppa_list = dma_ppa_list;
+	}
+	rqd->bio = bio;
+	rqd->nr_ppas = nr_secs;
+	rqd->private = pblk;
+	pblk_end_io_read(rqd);
+	return NVM_IO_ERR;
+}
+
+/*
+ * Resolve a single-sector read. If the lba is unmapped or its data could be
+ * copied from the write buffer, bit 0 of @read_bitmap is set; otherwise the
+ * physical address is stored in rqd->ppa_addr so that the sector is read
+ * from the media.
+ */
+static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd,
+			 unsigned long *read_bitmap)
+{
+	struct bio *bio = rqd->bio;
+	sector_t lba = pblk_get_lba(bio);
+	struct ppa_addr ppa;
+
+	/* logic error: lba out-of-bounds. Ignore read request */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: read lba out of bounds\n");
+		return;
+	}
+
+	pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+	for (;;) {
+		if (pblk_ppa_empty(ppa)) {
+			WARN_ON(test_and_set_bit(0, read_bitmap));
+			return;
+		}
+
+		if (!pblk_addr_in_cache(ppa)) {
+			rqd->ppa_addr = ppa;
+			return;
+		}
+
+		/* Try to read from write buffer. The address is later checked
+		 * on the write buffer to prevent retrieving overwritten data.
+		 */
+		if (pblk_read_from_cache(pblk, bio, lba, ppa, 0)) {
+			WARN_ON(test_and_set_bit(0, read_bitmap));
+			return;
+		}
+
+		/* The cache entry was overwritten: look the lba up again */
+		pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
+	}
+}
+
+/*
+ * Submit a read bio.
+ *
+ * Each sector is resolved through the L2P table. Sectors that are unmapped or
+ * still in the write buffer are served immediately and flagged in a bitmap.
+ * Depending on the bitmap the bio is then either completed right away (all
+ * sectors served), cloned and sent to the device (no sector served), or
+ * completed through a partial read that fills the holes.
+ *
+ * Returns NVM_IO_OK if the bio was completed or submitted, NVM_IO_ERR
+ * otherwise. On error the bio has not been ended; the request and the extra
+ * bio reference taken here have been released.
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	unsigned int nr_secs = pblk_get_secs(bio);
+	struct nvm_rq *rqd;
+	unsigned long read_bitmap; /* Max 64 ppas per request */
+	unsigned int bio_init_idx;
+	int ret = NVM_IO_ERR;
+
+	if (nr_secs > PBLK_MAX_REQ_ADDRS)
+		return NVM_IO_ERR;
+
+	bitmap_zero(&read_bitmap, nr_secs);
+
+	rqd = pblk_alloc_rqd(pblk, READ);
+	if (IS_ERR(rqd)) {
+		pr_err_ratelimited("pblk: not able to alloc rqd");
+		return NVM_IO_ERR;
+	}
+
+	rqd->opcode = NVM_OP_PREAD;
+	rqd->bio = bio;
+	rqd->nr_ppas = nr_secs;
+	rqd->private = pblk;
+	rqd->end_io = pblk_end_io_read;
+
+	/* Save the index for this bio's start. This is needed in case
+	 * we need to fill a partial read.
+	 */
+	bio_init_idx = pblk_get_bi_idx(bio);
+
+	if (nr_secs > 1) {
+		rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+						&rqd->dma_ppa_list);
+		if (!rqd->ppa_list) {
+			pr_err("pblk: not able to allocate ppa list\n");
+			goto fail_rqd_free;
+		}
+
+		pblk_read_ppalist_rq(pblk, rqd, &read_bitmap);
+	} else {
+		pblk_read_rq(pblk, rqd, &read_bitmap);
+	}
+
+	/* Reference dropped by pblk_end_io_read() on every path below */
+	bio_get(bio);
+	if (bitmap_full(&read_bitmap, nr_secs)) {
+		bio_endio(bio);
+		pblk_end_io_read(rqd);
+		return NVM_IO_OK;
+	}
+
+	/* All sectors are to be read from the device */
+	if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) {
+		struct bio *int_bio = NULL;
+		struct pblk_r_ctx *r_ctx = nvm_rq_to_pdu(rqd);
+
+		/* Clone read bio to deal with read errors internally */
+		int_bio = bio_clone_bioset(bio, GFP_KERNEL, fs_bio_set);
+		if (!int_bio) {
+			pr_err("pblk: could not clone read bio\n");
+			goto fail_end_io;
+		}
+
+		rqd->bio = int_bio;
+		r_ctx->orig_bio = bio;
+
+		ret = pblk_submit_read_io(pblk, rqd);
+		if (ret) {
+			pr_err("pblk: read IO submission failed\n");
+			/*
+			 * Undo the clone so that the cleanup below releases
+			 * the original bio without completing it; ending it
+			 * is left to the caller on error.
+			 */
+			bio_put(int_bio);
+			rqd->bio = bio;
+			r_ctx->orig_bio = NULL;
+			goto fail_end_io;
+		}
+
+		return NVM_IO_OK;
+	}
+
+	/* The read bio request could be partially filled by the write buffer,
+	 * but there are some holes that need to be read from the drive.
+	 */
+	ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
+	if (ret) {
+		pr_err("pblk: failed to perform partial read\n");
+		return ret;
+	}
+
+	return NVM_IO_OK;
+
+fail_end_io:
+	/* Frees the ppa list, drops the bio_get() reference, frees rqd */
+	pblk_end_io_read(rqd);
+	return ret;
+
+fail_rqd_free:
+	pblk_free_rqd(pblk, rqd, READ);
+	return ret;
+}
+
+/*
+ * Build the ppa list for a multi-sector GC read. Lbas whose current mapping
+ * is in the cache, points to a block other than @line, or is empty are
+ * marked ADDR_EMPTY in @lba_list; the rest are appended to rqd->ppa_list.
+ *
+ * Returns the number of ppas placed in the list.
+ */
+static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+			      struct pblk_line *line, u64 *lba_list,
+			      unsigned int nr_secs)
+{
+	struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS];
+	int nr_valid = 0;
+	int i;
+
+	pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs);
+
+	for (i = 0; i < nr_secs; i++) {
+		struct ppa_addr p = ppas[i];
+
+		if (pblk_addr_in_cache(p) || p.g.blk != line->id ||
+						pblk_ppa_empty(p))
+			lba_list[i] = ADDR_EMPTY;
+		else
+			rqd->ppa_list[nr_valid++] = p;
+	}
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_add(nr_valid, &pblk->inflight_reads);
+#endif
+	return nr_valid;
+}
+
+/*
+ * Prepare a single-sector GC read. The sector is read only if @lba is set,
+ * in bounds, and its current mapping is a non-empty media address inside
+ * @line; in that case the address is stored in rqd->ppa_addr.
+ *
+ * Returns 1 if the sector must be read, 0 otherwise.
+ */
+static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
+		      struct pblk_line *line, sector_t lba)
+{
+	struct ppa_addr ppa;
+
+	if (lba == ADDR_EMPTY)
+		return 0;
+
+	/* logic error: lba out-of-bounds */
+	if (lba >= pblk->rl.nr_secs) {
+		WARN(1, "pblk: read lba out of bounds\n");
+		return 0;
+	}
+
+	spin_lock(&pblk->trans_lock);
+	ppa = pblk_trans_map_get(pblk, lba);
+	spin_unlock(&pblk->trans_lock);
+
+	/* Ignore updated values until the moment */
+	if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id ||
+							pblk_ppa_empty(ppa))
+		return 0;
+
+	rqd->ppa_addr = ppa;
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_inc(&pblk->inflight_reads);
+#endif
+
+	return 1;
+}
+
+/*
+ * Synchronously read into @data the sectors of @line that are still valid.
+ *
+ * @lba_list holds @nr_secs lbas; entries that no longer map to @line are
+ * marked ADDR_EMPTY by the helpers. The number of sectors actually read is
+ * stored in @secs_to_gc.
+ *
+ * Returns NVM_IO_OK on success (also when there is nothing to read) and
+ * NVM_IO_ERR if the ppa list or the bio cannot be set up or the read cannot
+ * be submitted.
+ */
+int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
+			unsigned int nr_secs, unsigned int *secs_to_gc,
+			struct pblk_line *line)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct request_queue *q = dev->q;
+	struct bio *bio;
+	struct nvm_rq rqd;
+	/*
+	 * Tracks ownership of the DMA ppa list explicitly. rqd.nr_ppas cannot
+	 * be used for this: it is still 0 when there is nothing to read or
+	 * when the bio cannot be mapped, which would leak the list.
+	 */
+	bool free_ppa_list = false;
+	int ret = NVM_IO_OK;
+	int data_len;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	memset(&rqd, 0, sizeof(struct nvm_rq));
+
+	if (nr_secs > 1) {
+		rqd.ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+							&rqd.dma_ppa_list);
+		if (!rqd.ppa_list)
+			return NVM_IO_ERR;
+		free_ppa_list = true;
+
+		*secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list,
+								nr_secs);
+		if (*secs_to_gc == 1) {
+			struct ppa_addr ppa;
+
+			/* A single sector is addressed through ppa_addr */
+			ppa = rqd.ppa_list[0];
+			nvm_dev_dma_free(dev->parent, rqd.ppa_list,
+							rqd.dma_ppa_list);
+			free_ppa_list = false;
+			rqd.ppa_addr = ppa;
+		}
+	} else {
+		*secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]);
+	}
+
+	if (!(*secs_to_gc))
+		goto out;
+
+	data_len = (*secs_to_gc) * geo->sec_size;
+	bio = bio_map_kern(q, data, data_len, GFP_KERNEL);
+	if (IS_ERR(bio)) {
+		/* PTR_ERR() yields a signed long */
+		pr_err("pblk: could not allocate GC bio (%ld)\n", PTR_ERR(bio));
+		ret = NVM_IO_ERR;
+		goto out;
+	}
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+	rqd.opcode = NVM_OP_PREAD;
+	rqd.end_io = pblk_end_io_sync;
+	rqd.private = &wait;
+	rqd.nr_ppas = *secs_to_gc;
+	rqd.bio = bio;
+
+	if (pblk_submit_read_io(pblk, &rqd)) {
+		bio_endio(bio);
+		pr_err("pblk: GC read request failed\n");
+		ret = NVM_IO_ERR;
+		goto out;
+	}
+
+	/* NOTE(review): a timeout is only logged; processing carries on */
+	if (!wait_for_completion_io_timeout(&wait,
+				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+		pr_err("pblk: GC read I/O timed out\n");
+	}
+
+	if (rqd.error) {
+		atomic_long_inc(&pblk->read_failed_gc);
+#ifdef CONFIG_NVM_DEBUG
+		pblk_print_failed_rqd(pblk, &rqd, rqd.error);
+#endif
+	}
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_add(*secs_to_gc, &pblk->sync_reads);
+	atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads);
+	atomic_long_sub(*secs_to_gc, &pblk->inflight_reads);
+#endif
+
+out:
+	if (free_ppa_list)
+		nvm_dev_dma_free(dev->parent, rqd.ppa_list, rqd.dma_ppa_list);
+	return ret;
+}
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
new file mode 100644
index 0000000000000..f8f85087cd3c2
--- /dev/null
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -0,0 +1,998 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-recovery.c - pblk's recovery path
+ */
+
+#include "pblk.h"
+
+void pblk_submit_rec(struct work_struct *work)
+{ /* Work handler for a recovery context: builds a write bio from the entries
+ * on recovery->failed, attaches it to the request stored in recovery->rqd and
+ * submits it. The recovery context is returned to pblk->rec_pool only after
+ * the I/O has been submitted successfully.
+ *
+ * NOTE(review): the early return on bio allocation failure releases neither
+ * rqd nor recovery, and the err path releases rqd but not recovery - confirm
+ * whether something else owns them in those cases.
+ */
+ struct pblk_rec_ctx *recovery =
+ container_of(work, struct pblk_rec_ctx, ws_rec);
+ struct pblk *pblk = recovery->pblk;
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_rq *rqd = recovery->rqd;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+ int max_secs = nvm_max_phys_sects(dev);
+ struct bio *bio;
+ unsigned int nr_rec_secs;
+ unsigned int pgs_read;
+ int ret;
+
+ nr_rec_secs = bitmap_weight((unsigned long int *)&rqd->ppa_status,
+ max_secs); /* number of bits set in rqd->ppa_status */
+
+ bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
+ if (!bio) {
+ pr_err("pblk: not able to create recovery bio\n");
+ return; /* see the note at the top: nothing is released here */
+ }
+
+ bio->bi_iter.bi_sector = 0;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ rqd->bio = bio;
+ rqd->nr_ppas = nr_rec_secs;
+
+ pgs_read = pblk_rb_read_to_bio_list(&pblk->rwb, bio, &recovery->failed,
+ nr_rec_secs);
+ if (pgs_read != nr_rec_secs) { /* every counted sector must make it into the bio */
+ pr_err("pblk: could not read recovery entries\n");
+ goto err;
+ }
+
+ if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
+ pr_err("pblk: could not setup recovery request\n");
+ goto err;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(nr_rec_secs, &pblk->recov_writes);
+#endif
+
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ pr_err("pblk: I/O submission failed: %d\n", ret);
+ goto err;
+ }
+
+ mempool_free(recovery, pblk->rec_pool); /* only reached once the I/O is submitted */
+ return;
+
+err:
+ bio_put(bio);
+ pblk_free_rqd(pblk, rqd, WRITE);
+}
+
+/*
+ * Split a write context into a completed part and a part to be re-written.
+ *
+ * A new write request is allocated for the entries that follow the first
+ * @comp completed ones. Its status bitmap is @comp_bits shifted right by
+ * @comp, and its context takes over the remaining valid/padded counts, while
+ * @c_ctx is trimmed to describe only the completed entries. The new request
+ * is stored in @recovery.
+ *
+ * Returns 0 on success, -ENOMEM if the request cannot be allocated.
+ */
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
+			struct pblk_rec_ctx *recovery, u64 *comp_bits,
+			unsigned int comp)
+{
+	int max_secs = nvm_max_phys_sects(pblk->dev);
+	int total_entries = c_ctx->nr_valid + c_ctx->nr_padded;
+	struct pblk_c_ctx *rec_ctx;
+	struct nvm_rq *rec_rqd;
+
+	rec_rqd = pblk_alloc_rqd(pblk, WRITE);
+	if (IS_ERR(rec_rqd)) {
+		pr_err("pblk: could not create recovery req.\n");
+		return -ENOMEM;
+	}
+
+	rec_ctx = nvm_rq_to_pdu(rec_rqd);
+
+	/* Copy completion bitmap, but exclude the first X completed entries */
+	bitmap_shift_right((unsigned long int *)&rec_rqd->ppa_status,
+				(unsigned long int *)comp_bits,
+				comp, max_secs);
+
+	/* Save the context for the entries that need to be re-written and
+	 * update current context with the completed entries.
+	 */
+	rec_ctx->sentry = pblk_rb_wrap_pos(&pblk->rwb, c_ctx->sentry + comp);
+	if (comp < c_ctx->nr_valid) {
+		/* Some valid entries and all the padding are still pending */
+		rec_ctx->nr_valid = c_ctx->nr_valid - comp;
+		rec_ctx->nr_padded = c_ctx->nr_padded;
+
+		c_ctx->nr_valid = comp;
+		c_ctx->nr_padded = 0;
+	} else {
+		/* All valid entries completed; only padding is pending */
+		rec_ctx->nr_valid = 0;
+		rec_ctx->nr_padded = total_entries - comp;
+
+		c_ctx->nr_padded = comp - c_ctx->nr_valid;
+	}
+
+	recovery->rqd = rec_rqd;
+	recovery->pblk = pblk;
+
+	return 0;
+}
+
+/*
+ * Validate the end metadata of a line (CRC and magic identifier) and return
+ * its lba list, or NULL if either check fails.
+ */
+__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta)
+{
+	u32 expected_crc = pblk_calc_emeta_crc(pblk, emeta);
+
+	if (le32_to_cpu(emeta->crc) != expected_crc ||
+	    le32_to_cpu(emeta->header.identifier) != PBLK_MAGIC)
+		return NULL;
+
+	return pblk_line_emeta_to_lbas(emeta);
+}
+
+/*
+ * Rebuild the L2P entries of @line from the lba list in its end metadata.
+ *
+ * Every data sector outside bad blocks is visited: sectors whose lba is
+ * ADDR_EMPTY are marked invalid in the line, all others are mapped to their
+ * physical address. The scan stops once the number of valid lbas recorded in
+ * the metadata has been mapped.
+ *
+ * Returns 0 on success, 1 if the end metadata does not validate.
+ */
+static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct line_emeta *emeta = line->emeta;
+	__le64 *lba_list;
+	int data_start;
+	int nr_data_lbas, nr_valid_lbas, nr_lbas = 0;
+	int i;
+
+	lba_list = pblk_recov_get_lba_list(pblk, emeta);
+	if (!lba_list)
+		return 1;
+
+	data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
+	nr_data_lbas = lm->sec_per_line - lm->emeta_sec;
+	nr_valid_lbas = le64_to_cpu(emeta->nr_valid_lbas);
+
+	for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) {
+		struct ppa_addr ppa;
+		int pos;
+
+		ppa = addr_to_pblk_ppa(pblk, i, line->id);
+		pos = pblk_ppa_to_pos(geo, ppa);
+
+		/* Do not update bad blocks */
+		if (test_bit(pos, line->blk_bitmap))
+			continue;
+
+		if (le64_to_cpu(lba_list[i]) == ADDR_EMPTY) {
+			spin_lock(&line->lock);
+			if (test_and_set_bit(i, line->invalid_bitmap))
+				WARN_ONCE(1, "pblk: rec. double invalidate:\n");
+			else
+				line->vsc--;
+			spin_unlock(&line->lock);
+
+			continue;
+		}
+
+		pblk_update_map(pblk, le64_to_cpu(lba_list[i]), ppa);
+		nr_lbas++;
+	}
+
+	/*
+	 * The on-media counter is little endian: convert it before printing
+	 * instead of passing the raw __le64 to the format string.
+	 */
+	if (nr_valid_lbas != nr_lbas)
+		pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n",
+			line->id,
+			(unsigned long long)le64_to_cpu(emeta->nr_valid_lbas),
+			nr_lbas);
+
+	line->left_msecs = 0;
+
+	return 0;
+}
+
+/*
+ * Number of sectors in @line left after subtracting the start and end
+ * metadata sectors and all sectors that belong to bad blocks.
+ */
+static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	int bad_blks;
+
+	bad_blks = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+
+	return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec -
+				bad_blks * geo->sec_per_blk;
+}
+
+struct pblk_recov_alloc { /* bundle of buffers passed by value to the OOB recovery helpers */
+ struct ppa_addr *ppa_list; /* installed as rqd->ppa_list */
+ struct pblk_sec_meta *meta_list; /* installed as rqd->meta_list; holds the per-sector lba */
+ struct nvm_rq *rqd; /* request, zeroed and refilled for every I/O */
+ void *data; /* kernel buffer mapped into each bio with bio_map_kern() */
+ dma_addr_t dma_ppa_list; /* installed as rqd->dma_ppa_list */
+ dma_addr_t dma_meta_list; /* installed as rqd->dma_meta_list */
+};
+
+/*
+ * Read the sectors of @line between @r_ptr and the line write pointer
+ * (line->cur_sec) and update the L2P table with the lbas found in their
+ * out-of-band metadata. The read is issued in chunks, reusing the request
+ * and buffers in @p, and each chunk is waited for synchronously.
+ *
+ * Returns 0 on success or a negative error code if a bio cannot be mapped,
+ * a request cannot be submitted, a read times out or a read fails.
+ */
+static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line,
+			       struct pblk_recov_alloc p, u64 r_ptr)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct ppa_addr *ppa_list;
+	struct pblk_sec_meta *meta_list;
+	struct nvm_rq *rqd;
+	struct bio *bio;
+	void *data;
+	dma_addr_t dma_ppa_list, dma_meta_list;
+	u64 r_ptr_int;
+	int left_ppas;
+	int rq_ppas, rq_len;
+	int i, j;
+	int ret = 0;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	ppa_list = p.ppa_list;
+	meta_list = p.meta_list;
+	rqd = p.rqd;
+	data = p.data;
+	dma_ppa_list = p.dma_ppa_list;
+	dma_meta_list = p.dma_meta_list;
+
+	left_ppas = line->cur_sec - r_ptr;
+	if (!left_ppas)
+		return 0;
+
+	r_ptr_int = r_ptr;
+
+next_read_rq:
+	/* The request is reused for every chunk */
+	memset(rqd, 0, pblk_r_rq_size);
+
+	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+	if (!rq_ppas)
+		rq_ppas = pblk->min_write_pgs;
+	rq_len = rq_ppas * geo->sec_size;
+
+	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+	rqd->bio = bio;
+	rqd->opcode = NVM_OP_PREAD;
+	rqd->flags = pblk_set_read_mode(pblk);
+	rqd->meta_list = meta_list;
+	rqd->nr_ppas = rq_ppas;
+	rqd->ppa_list = ppa_list;
+	rqd->dma_ppa_list = dma_ppa_list;
+	rqd->dma_meta_list = dma_meta_list;
+	rqd->end_io = pblk_end_io_sync;
+	rqd->private = &wait;
+
+	/* Fill the ppa list min_write_pgs at a time, skipping bad blocks */
+	for (i = 0; i < rqd->nr_ppas; ) {
+		struct ppa_addr ppa;
+		int pos;
+
+		ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+		pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+		while (test_bit(pos, line->blk_bitmap)) {
+			r_ptr_int += pblk->min_write_pgs;
+			ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+			pos = pblk_dev_ppa_to_pos(geo, ppa);
+		}
+
+		for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
+			rqd->ppa_list[i] =
+				addr_to_gen_ppa(pblk, r_ptr_int, line->id);
+	}
+
+	/* If read fails, more padding is needed */
+	ret = pblk_submit_io(pblk, rqd);
+	if (ret) {
+		pr_err("pblk: I/O submission failed: %d\n", ret);
+		return ret;
+	}
+
+	if (!wait_for_completion_io_timeout(&wait,
+				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+		pr_err("pblk: L2P recovery read timed out\n");
+		return -EINTR;
+	}
+
+	reinit_completion(&wait);
+
+	/* At this point, the read should not fail. If it does, it is a problem
+	 * we cannot recover from here. Need FTL log.
+	 */
+	if (rqd->error) {
+		pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
+		return -EINTR;
+	}
+
+	for (i = 0; i < rqd->nr_ppas; i++) {
+		u64 lba = le64_to_cpu(meta_list[i].lba);
+
+		/*
+		 * Skip empty and out-of-bounds lbas. Valid lbas are strictly
+		 * below rl.nr_secs, as in the read path checks.
+		 */
+		if (lba == ADDR_EMPTY || lba >= pblk->rl.nr_secs)
+			continue;
+
+		pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+	}
+
+	left_ppas -= rq_ppas;
+	if (left_ppas > 0)
+		goto next_read_rq;
+
+	return 0;
+}
+
+static int pblk_recov_pad_oob(struct pblk *pblk, struct pblk_line *line,
+ struct pblk_recov_alloc p, int left_ppas)
+{ /* Pad @line with up to @left_ppas sectors. Padding is written in chunks:
+ * each padded sector is invalidated in the map and gets ADDR_EMPTY as its lba
+ * both in the request's OOB metadata and in the line's emeta lba list. Every
+ * chunk is submitted and waited for before the next one is built.
+ *
+ * Returns 0 on success, or an error code if a bio cannot be mapped or a
+ * request cannot be submitted.
+ */
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct ppa_addr *ppa_list;
+ struct pblk_sec_meta *meta_list;
+ struct nvm_rq *rqd;
+ struct bio *bio;
+ void *data;
+ dma_addr_t dma_ppa_list, dma_meta_list;
+ __le64 *lba_list = pblk_line_emeta_to_lbas(line->emeta);
+ u64 w_ptr = line->cur_sec; /* reassigned by pblk_alloc_page() before it is used */
+ int left_line_ppas = line->left_msecs;
+ int rq_ppas, rq_len;
+ int i, j;
+ int ret = 0;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ ppa_list = p.ppa_list;
+ meta_list = p.meta_list;
+ rqd = p.rqd;
+ data = p.data;
+ dma_ppa_list = p.dma_ppa_list;
+ dma_meta_list = p.dma_meta_list;
+
+next_pad_rq:
+ rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+ if (!rq_ppas)
+ rq_ppas = pblk->min_write_pgs; /* fall back to the minimum write size */
+ rq_len = rq_ppas * geo->sec_size;
+
+ bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+
+ memset(rqd, 0, pblk_r_rq_size); /* the request is reused for every chunk */
+
+ rqd->bio = bio;
+ rqd->opcode = NVM_OP_PWRITE;
+ rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+ rqd->meta_list = meta_list;
+ rqd->nr_ppas = rq_ppas;
+ rqd->ppa_list = ppa_list;
+ rqd->dma_ppa_list = dma_ppa_list;
+ rqd->dma_meta_list = dma_meta_list;
+ rqd->end_io = pblk_end_io_sync;
+ rqd->private = &wait;
+
+ for (i = 0; i < rqd->nr_ppas; ) { /* i advances in the inner loop, min_write_pgs at a time */
+ struct ppa_addr ppa;
+ int pos;
+
+ w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+ ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+
+ while (test_bit(pos, line->blk_bitmap)) { /* skip positions flagged in the bad block bitmap */
+ w_ptr += pblk->min_write_pgs;
+ ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+ pos = pblk_ppa_to_pos(geo, ppa);
+ }
+
+ for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
+ struct ppa_addr dev_ppa;
+
+ dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+
+ pblk_map_invalidate(pblk, dev_ppa);
+ meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); /* OOB metadata of the padded sector */
+ lba_list[w_ptr] = cpu_to_le64(ADDR_EMPTY); /* same marker in the line's emeta lba list */
+ rqd->ppa_list[i] = dev_ppa;
+ }
+ }
+
+ ret = pblk_submit_io(pblk, rqd);
+ if (ret) {
+ pr_err("pblk: I/O submission failed: %d\n", ret);
+ return ret;
+ }
+
+ if (!wait_for_completion_io_timeout(&wait,
+ msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+ pr_err("pblk: L2P recovery write timed out\n"); /* logged only; padding carries on */
+ }
+ reinit_completion(&wait);
+
+ left_line_ppas -= rq_ppas;
+ left_ppas -= rq_ppas;
+ if (left_ppas > 0 && left_line_ppas) /* more to pad and the local line budget is not used up */
+ goto next_pad_rq;
+
+ return 0;
+}
+
+/* When this function is called, it means that not all upper pages have been
+ * written in a page that contains valid data. In order to recover this data, we
+ * first find the write pointer on the device, then we pad all necessary
+ * sectors, and finally attempt to read the valid data
+ *
+ * The line is read chunk by chunk from its current write pointer. Chunks that
+ * read back cleanly update the L2P table from their OOB metadata. When an
+ * empty page is reported, the sectors from the first flagged one onwards are
+ * rolled back, the line is padded, and the sectors from the recovery read
+ * pointer up to the new write pointer are read again.
+ *
+ * Returns 0, or an error code if a bio cannot be mapped or a request cannot
+ * be submitted.
+ */
+static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line,
+				   struct pblk_recov_alloc p)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct ppa_addr *ppa_list;
+	struct pblk_sec_meta *meta_list;
+	struct nvm_rq *rqd;
+	struct bio *bio;
+	void *data;
+	dma_addr_t dma_ppa_list, dma_meta_list;
+	u64 w_ptr = 0, r_ptr;
+	int rq_ppas, rq_len;
+	int i, j;
+	int ret = 0;
+	int rec_round;
+	int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	ppa_list = p.ppa_list;
+	meta_list = p.meta_list;
+	rqd = p.rqd;
+	data = p.data;
+	dma_ppa_list = p.dma_ppa_list;
+	dma_meta_list = p.dma_meta_list;
+
+	/* we could recover up until the line write pointer */
+	r_ptr = line->cur_sec;
+	rec_round = 0;
+
+next_rq:
+	/* The request is reused for every chunk */
+	memset(rqd, 0, pblk_r_rq_size);
+
+	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+	if (!rq_ppas)
+		rq_ppas = pblk->min_write_pgs;
+	rq_len = rq_ppas * geo->sec_size;
+
+	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+	if (IS_ERR(bio))
+		return PTR_ERR(bio);
+
+	bio->bi_iter.bi_sector = 0; /* internal bio */
+	bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+	rqd->bio = bio;
+	rqd->opcode = NVM_OP_PREAD;
+	rqd->flags = pblk_set_read_mode(pblk);
+	rqd->meta_list = meta_list;
+	rqd->nr_ppas = rq_ppas;
+	rqd->ppa_list = ppa_list;
+	rqd->dma_ppa_list = dma_ppa_list;
+	rqd->dma_meta_list = dma_meta_list;
+	rqd->end_io = pblk_end_io_sync;
+	rqd->private = &wait;
+
+	/* Fill the ppa list min_write_pgs at a time, skipping bad blocks */
+	for (i = 0; i < rqd->nr_ppas; ) {
+		struct ppa_addr ppa;
+		int pos;
+
+		w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
+		ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+		pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+		while (test_bit(pos, line->blk_bitmap)) {
+			w_ptr += pblk->min_write_pgs;
+			ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
+			pos = pblk_dev_ppa_to_pos(geo, ppa);
+		}
+
+		for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
+			rqd->ppa_list[i] =
+				addr_to_gen_ppa(pblk, w_ptr, line->id);
+	}
+
+	ret = pblk_submit_io(pblk, rqd);
+	if (ret) {
+		pr_err("pblk: I/O submission failed: %d\n", ret);
+		return ret;
+	}
+
+	/* NOTE(review): a timeout is only logged; processing carries on */
+	if (!wait_for_completion_io_timeout(&wait,
+				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+		pr_err("pblk: L2P recovery read timed out\n");
+	}
+	reinit_completion(&wait);
+
+	/* This should not happen since the read failed during normal recovery,
+	 * but the media works funny sometimes...
+	 */
+	if (!rec_round++ && !rqd->error) {
+		rec_round = 0;
+		for (i = 0; i < rqd->nr_ppas; i++, r_ptr++) {
+			u64 lba = le64_to_cpu(meta_list[i].lba);
+
+			/*
+			 * Skip empty and out-of-bounds lbas. Valid lbas are
+			 * strictly below rl.nr_secs, as in the read path
+			 * checks.
+			 */
+			if (lba == ADDR_EMPTY || lba >= pblk->rl.nr_secs)
+				continue;
+
+			pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+		}
+	}
+
+	/* Reached the end of the written line */
+	if (rqd->error == NVM_RSP_ERR_EMPTYPAGE) {
+		int pad_secs, nr_error_bits, bit;
+		/*
+		 * NOTE(review): this declaration shadows the function-level
+		 * ret, so padding/read failures below are logged but never
+		 * returned - confirm whether that is intended.
+		 */
+		int ret;
+
+		bit = find_first_bit((void *)&rqd->ppa_status, rqd->nr_ppas);
+		nr_error_bits = rqd->nr_ppas - bit;
+
+		/* Roll back failed sectors */
+		line->cur_sec -= nr_error_bits;
+		line->left_msecs += nr_error_bits;
+		bitmap_clear(line->map_bitmap, line->cur_sec, nr_error_bits);
+
+		pad_secs = pblk_pad_distance(pblk);
+		if (pad_secs > line->left_msecs)
+			pad_secs = line->left_msecs;
+
+		ret = pblk_recov_pad_oob(pblk, line, p, pad_secs);
+		if (ret)
+			pr_err("pblk: OOB padding failed (err:%d)\n", ret);
+
+		ret = pblk_recov_read_oob(pblk, line, p, r_ptr);
+		if (ret)
+			pr_err("pblk: OOB read failed (err:%d)\n", ret);
+
+		line->left_ssecs = line->left_msecs;
+		/* Forces the scan to stop after this round */
+		left_ppas = 0;
+	}
+
+	left_ppas -= rq_ppas;
+	if (left_ppas > 0)
+		goto next_rq;
+
+	return ret;
+}
+
+/*
+ * Read a line request by request and rebuild L2P entries from the lbas found
+ * in the per-sector metadata returned with each read.
+ *
+ * Scanning stops at the first request that completes with an error: the
+ * sectors from the first bit set in ppa_status onwards are given back to the
+ * line, and only the sectors before it are mapped. *done is set to 1 on entry
+ * and cleared when that error is anything other than NVM_RSP_ERR_EMPTYPAGE.
+ *
+ * Returns the result of the last pblk_submit_io() call, or the error from
+ * bio_map_kern() if the data buffer cannot be mapped.
+ */
+static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
+			       struct pblk_recov_alloc p, int *done)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct ppa_addr *ppa_list = p.ppa_list;
+	struct pblk_sec_meta *meta_list = p.meta_list;
+	struct nvm_rq *rqd = p.rqd;
+	void *data = p.data;
+	dma_addr_t dma_ppa_list = p.dma_ppa_list;
+	dma_addr_t dma_meta_list = p.dma_meta_list;
+	int left_ppas = pblk_calc_sec_in_line(pblk, line);
+	int ret = 0;
+	DECLARE_COMPLETION_ONSTACK(wait);
+
+	*done = 1;
+
+	/* One iteration per read request; at least one request is issued */
+	do {
+		struct bio *bio;
+		u64 paddr;
+		int rq_ppas, rq_len;
+		int i, j;
+
+		memset(rqd, 0, pblk_r_rq_size);
+
+		rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+		if (!rq_ppas)
+			rq_ppas = pblk->min_write_pgs;
+		rq_len = rq_ppas * geo->sec_size;
+
+		bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
+		if (IS_ERR(bio))
+			return PTR_ERR(bio);
+
+		bio->bi_iter.bi_sector = 0; /* internal bio */
+		bio_set_op_attrs(bio, REQ_OP_READ, 0);
+
+		rqd->bio = bio;
+		rqd->opcode = NVM_OP_PREAD;
+		rqd->flags = pblk_set_read_mode(pblk);
+		rqd->meta_list = meta_list;
+		rqd->nr_ppas = rq_ppas;
+		rqd->ppa_list = ppa_list;
+		rqd->dma_ppa_list = dma_ppa_list;
+		rqd->dma_meta_list = dma_meta_list;
+		rqd->end_io = pblk_end_io_sync;
+		rqd->private = &wait;
+
+		/* Fill the ppa list in chunks of min_write_pgs sectors */
+		for (i = 0; i < rqd->nr_ppas; ) {
+			struct ppa_addr ppa;
+			int pos;
+
+			paddr = pblk_alloc_page(pblk, line,
+						pblk->min_write_pgs);
+			ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+			pos = pblk_dev_ppa_to_pos(geo, ppa);
+
+			/* Step over positions flagged in the line's blk_bitmap */
+			while (test_bit(pos, line->blk_bitmap)) {
+				paddr += pblk->min_write_pgs;
+				ppa = addr_to_gen_ppa(pblk, paddr, line->id);
+				pos = pblk_dev_ppa_to_pos(geo, ppa);
+			}
+
+			for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
+				rqd->ppa_list[i] =
+					addr_to_gen_ppa(pblk, paddr, line->id);
+		}
+
+		ret = pblk_submit_io(pblk, rqd);
+		if (ret) {
+			pr_err("pblk: I/O submission failed: %d\n", ret);
+			bio_put(bio);
+			return ret;
+		}
+
+		if (!wait_for_completion_io_timeout(&wait,
+				msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) {
+			pr_err("pblk: L2P recovery read timed out\n");
+		}
+		reinit_completion(&wait);
+
+		/* Reached the end of the written line */
+		if (rqd->error) {
+			int nr_error_bits, bit;
+
+			bit = find_first_bit((void *)&rqd->ppa_status,
+					     rqd->nr_ppas);
+			nr_error_bits = rqd->nr_ppas - bit;
+
+			/* Roll back failed sectors */
+			line->cur_sec -= nr_error_bits;
+			line->left_msecs += nr_error_bits;
+			line->left_ssecs = line->left_msecs;
+			bitmap_clear(line->map_bitmap, line->cur_sec,
+				     nr_error_bits);
+
+			/* Last request: map only the sectors before the error */
+			left_ppas = 0;
+			rqd->nr_ppas = bit;
+
+			if (rqd->error != NVM_RSP_ERR_EMPTYPAGE)
+				*done = 0;
+		}
+
+		for (i = 0; i < rqd->nr_ppas; i++) {
+			u64 lba = le64_to_cpu(meta_list[i].lba);
+
+			if (lba == ADDR_EMPTY || lba > pblk->rl.nr_secs)
+				continue;
+
+			pblk_update_map(pblk, lba, rqd->ppa_list[i]);
+		}
+
+		left_ppas -= rq_ppas;
+	} while (left_ppas > 0);
+
+	return ret;
+}
+
+/*
+ * Scan a line for lbas stored in the out-of-band (OOB) area.
+ *
+ * Allocates the read request, the DMA metadata/ppa buffer and a data buffer,
+ * runs pblk_recov_scan_oob() and, when that scan reports it is not done,
+ * pblk_recov_scan_all_oob(). A line that ends up full is closed through
+ * pblk_line_recov_close(). All temporary resources are released before
+ * returning.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error reported
+ * by the request allocation or by one of the scans.
+ */
+static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_recov_alloc p;
+	struct pblk_sec_meta *meta_list;
+	struct ppa_addr *ppa_list;
+	struct nvm_rq *rqd;
+	dma_addr_t dma_ppa_list, dma_meta_list;
+	void *data;
+	int done;
+	int ret = 0;
+
+	rqd = pblk_alloc_rqd(pblk, READ);
+	if (IS_ERR(rqd))
+		return PTR_ERR(rqd);
+
+	meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
+	if (!meta_list) {
+		ret = -ENOMEM;
+		goto release_rqd;
+	}
+
+	/* The ppa list lives in the same DMA buffer, after the metadata */
+	ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
+	dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+
+	data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
+	if (!data) {
+		ret = -ENOMEM;
+		goto release_meta;
+	}
+
+	p.rqd = rqd;
+	p.data = data;
+	p.meta_list = meta_list;
+	p.dma_meta_list = dma_meta_list;
+	p.ppa_list = ppa_list;
+	p.dma_ppa_list = dma_ppa_list;
+
+	/* Fast scan first; fall back to the full scan only if it did not finish */
+	ret = pblk_recov_scan_oob(pblk, line, p, &done);
+	if (!ret && !done)
+		ret = pblk_recov_scan_all_oob(pblk, line, p);
+
+	if (ret) {
+		pr_err("pblk: could not recover L2P from OOB\n");
+		goto release_data;
+	}
+
+	if (pblk_line_is_full(line))
+		pblk_line_recov_close(pblk, line);
+
+release_data:
+	kfree(data);
+release_meta:
+	nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
+release_rqd:
+	pblk_free_rqd(pblk, rqd, READ);
+
+	return ret;
+}
+
+/*
+ * Insert a line on the list, keeping the list ordered by ascending sequence
+ * number (seq_nr).
+ *
+ * The line is placed in front of the first entry with a larger seq_nr, or at
+ * the tail when no such entry exists. The insertion point is tracked as a
+ * plain list_head so that the list cursor is never dereferenced after the
+ * loop has run to completion (at that point it does not point to a real
+ * struct pblk_line).
+ */
+static void pblk_recov_line_add_ordered(struct list_head *head,
+					struct pblk_line *line)
+{
+	struct list_head *pos = head;
+	struct pblk_line *t;
+
+	list_for_each_entry(t, head, list) {
+		if (t->seq_nr > line->seq_nr) {
+			pos = &t->list;
+			break;
+		}
+	}
+
+	/* Adding "at the tail" of pos inserts right before pos */
+	list_add_tail(&line->list, pos);
+}
+
+/*
+ * Scan recovery: rebuild the L2P table from the metadata found on the media.
+ *
+ * The start metadata (smeta) of every line is read and validated (CRC, magic,
+ * version, instance uuid). Valid lines are queued ordered by sequence number
+ * and then recovered from their end metadata (emeta), falling back to the
+ * out-of-band area when emeta cannot be read or used. Full lines are closed
+ * and moved to a GC list; a line that is not full is returned as data line.
+ *
+ * Returns the last non-full line found, NULL when there is none, or
+ * ERR_PTR(-EINVAL) when a line with an incompatible version is found.
+ */
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line *line, *tline, *data_line = NULL;
+	struct line_smeta *smeta;
+	struct line_emeta *emeta;
+	int found_lines = 0, recovered_lines = 0, open_lines = 0;
+	int is_next = 0;
+	int meta_line;
+	int i, valid_uuid = 0;
+	LIST_HEAD(recov_list);
+
+	/* TODO: Implement FTL snapshot */
+
+	/* Scan recovery - takes place when FTL snapshot fails */
+
+	/* Reserve one metadata slot; its buffers are shared by all lines */
+	spin_lock(&l_mg->free_lock);
+	meta_line = find_first_zero_bit(&l_mg->meta_bitmap, PBLK_DATA_LINES);
+	set_bit(meta_line, &l_mg->meta_bitmap);
+	smeta = l_mg->sline_meta[meta_line].meta;
+	emeta = l_mg->eline_meta[meta_line].meta;
+	spin_unlock(&l_mg->free_lock);
+
+	/* Order data lines using their sequence number */
+	for (i = 0; i < l_mg->nr_lines; i++) {
+		u32 crc;
+
+		line = &pblk->lines[i];
+
+		memset(smeta, 0, lm->smeta_len);
+		line->smeta = smeta;
+		line->lun_bitmap = ((void *)(smeta)) +
+						sizeof(struct line_smeta);
+
+		/* Lines that cannot be read are assumed as not written here */
+		if (pblk_line_read_smeta(pblk, line))
+			continue;
+
+		crc = pblk_calc_smeta_crc(pblk, smeta);
+		if (le32_to_cpu(smeta->crc) != crc)
+			continue;
+
+		if (le32_to_cpu(smeta->header.identifier) != PBLK_MAGIC)
+			continue;
+
+		if (le16_to_cpu(smeta->header.version) != 1) {
+			/* Report the version in CPU byte order */
+			pr_err("pblk: found incompatible line version %u\n",
+					le16_to_cpu(smeta->header.version));
+			return ERR_PTR(-EINVAL);
+		}
+
+		/* The first valid instance uuid is used for initialization */
+		if (!valid_uuid) {
+			memcpy(pblk->instance_uuid, smeta->header.uuid, 16);
+			valid_uuid = 1;
+		}
+
+		if (memcmp(pblk->instance_uuid, smeta->header.uuid, 16)) {
+			pr_debug("pblk: ignore line %u due to uuid mismatch\n",
+					i);
+			continue;
+		}
+
+		/* Update line metadata */
+		spin_lock(&line->lock);
+		line->id = le32_to_cpu(line->smeta->header.id);
+		line->type = le16_to_cpu(line->smeta->header.type);
+		line->seq_nr = le64_to_cpu(line->smeta->seq_nr);
+		spin_unlock(&line->lock);
+
+		/* Update general metadata */
+		spin_lock(&l_mg->free_lock);
+		if (line->seq_nr >= l_mg->d_seq_nr)
+			l_mg->d_seq_nr = line->seq_nr + 1;
+		l_mg->nr_free_lines--;
+		spin_unlock(&l_mg->free_lock);
+
+		if (pblk_line_recov_alloc(pblk, line))
+			goto out;
+
+		pblk_recov_line_add_ordered(&recov_list, line);
+		found_lines++;
+		/* Report the sequence number in CPU byte order */
+		pr_debug("pblk: recovering data line %d, seq:%llu\n",
+				line->id,
+				(unsigned long long)le64_to_cpu(smeta->seq_nr));
+	}
+
+	if (!found_lines) {
+		/* Nothing to recover: set up a new instance uuid */
+		pblk_setup_uuid(pblk);
+
+		spin_lock(&l_mg->free_lock);
+		WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+							&l_mg->meta_bitmap));
+		spin_unlock(&l_mg->free_lock);
+
+		goto out;
+	}
+
+	/* Verify closed blocks and recover this portion of L2P table*/
+	list_for_each_entry_safe(line, tline, &recov_list, list) {
+		int off, nr_bb;
+
+		recovered_lines++;
+		/* Calculate where emeta starts based on the line bb */
+		off = lm->sec_per_line - lm->emeta_sec;
+		nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
+		off -= nr_bb * geo->sec_per_pl;
+
+		memset(emeta, 0, lm->emeta_len);
+		line->emeta = emeta;
+		line->emeta_ssec = off;
+
+		/* emeta unreadable: recover from the out-of-band area */
+		if (pblk_line_read_emeta(pblk, line)) {
+			pblk_recov_l2p_from_oob(pblk, line);
+			goto next;
+		}
+
+		/* emeta readable but unusable: same fallback */
+		if (pblk_recov_l2p_from_emeta(pblk, line))
+			pblk_recov_l2p_from_oob(pblk, line);
+
+next:
+		if (pblk_line_is_full(line)) {
+			struct list_head *move_list;
+
+			spin_lock(&line->lock);
+			line->state = PBLK_LINESTATE_CLOSED;
+			move_list = pblk_line_gc_list(pblk, line);
+			spin_unlock(&line->lock);
+
+			spin_lock(&l_mg->gc_lock);
+			list_move_tail(&line->list, move_list);
+			spin_unlock(&l_mg->gc_lock);
+
+			mempool_free(line->map_bitmap, pblk->line_meta_pool);
+			line->map_bitmap = NULL;
+			line->smeta = NULL;
+			line->emeta = NULL;
+		} else {
+			if (open_lines > 1)
+				pr_err("pblk: failed to recover L2P\n");
+
+			open_lines++;
+			line->meta_line = meta_line;
+			data_line = line;
+		}
+	}
+
+	spin_lock(&l_mg->free_lock);
+	if (!open_lines) {
+		/* No open line kept the metadata slot: give it back */
+		WARN_ON_ONCE(!test_and_clear_bit(meta_line,
+							&l_mg->meta_bitmap));
+		pblk_line_replace_data(pblk);
+	} else {
+		/* Allocate next line for preparation */
+		l_mg->data_next = pblk_line_get(pblk);
+		if (l_mg->data_next) {
+			l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
+			l_mg->data_next->type = PBLK_LINETYPE_DATA;
+			is_next = 1;
+		}
+	}
+	spin_unlock(&l_mg->free_lock);
+
+	/* Erase and account for the next line outside of free_lock */
+	if (is_next) {
+		pblk_line_erase(pblk, l_mg->data_next);
+		pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
+	}
+
+out:
+	if (found_lines != recovered_lines)
+		pr_err("pblk: failed to recover all found lines %d/%d\n",
+						found_lines, recovered_lines);
+
+	return data_line;
+}
+
+/*
+ * Pad the current data line until smeta can be read on it.
+ *
+ * The remaining sectors of the line (left_msecs) are padded through
+ * pblk_recov_pad_oob() and the line is then closed. Allocation failures make
+ * the function return silently; a padding failure is logged and the line is
+ * left open.
+ */
+void pblk_recov_pad(struct pblk *pblk)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_recov_alloc p;
+	struct pblk_line *line;
+	struct pblk_sec_meta *meta_list;
+	struct ppa_addr *ppa_list;
+	struct nvm_rq *rqd;
+	dma_addr_t dma_ppa_list, dma_meta_list;
+	void *data;
+
+	spin_lock(&l_mg->free_lock);
+	line = l_mg->data_line;
+	spin_unlock(&l_mg->free_lock);
+
+	rqd = pblk_alloc_rqd(pblk, READ);
+	if (IS_ERR(rqd))
+		return;
+
+	meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list);
+	if (!meta_list)
+		goto release_rqd;
+
+	/* The ppa list lives in the same DMA buffer, after the metadata */
+	ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
+	dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+
+	data = kcalloc(pblk->max_write_pgs, geo->sec_size, GFP_KERNEL);
+	if (!data)
+		goto release_meta;
+
+	p.rqd = rqd;
+	p.data = data;
+	p.meta_list = meta_list;
+	p.dma_meta_list = dma_meta_list;
+	p.ppa_list = ppa_list;
+	p.dma_ppa_list = dma_ppa_list;
+
+	if (pblk_recov_pad_oob(pblk, line, p, line->left_msecs)) {
+		pr_err("pblk: Tear down padding failed\n");
+		goto release_data;
+	}
+
+	pblk_line_close(pblk, line);
+
+release_data:
+	kfree(data);
+release_meta:
+	nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list);
+release_rqd:
+	pblk_free_rqd(pblk, rqd, READ);
+}
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
new file mode 100644
index 0000000000000..ab7cbb144f3fc
--- /dev/null
+++ b/drivers/lightnvm/pblk-rl.c
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-rl.c - pblk's rate limiter for user I/O
+ *
+ */
+
+#include "pblk.h"
+
+/* (Re)arm the user I/O activity timer so that it fires 5000 ms from now. */
+static void pblk_rl_kick_u_timer(struct pblk_rl *rl)
+{
+	unsigned long expires = jiffies + msecs_to_jiffies(5000);
+
+	mod_timer(&rl->u_timer, expires);
+}
+
+/*
+ * Tell whether nr_entries more user entries fit within the current user
+ * budget (rb_user_max). Returns 1 if they fit, 0 otherwise.
+ */
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+	int in_buffer = atomic_read(&rl->rb_user_cnt);
+
+	return in_buffer + nr_entries <= rl->rb_user_max;
+}
+
+/*
+ * Tell whether nr_entries more GC entries may be inserted. GC is only held
+ * back by its budget (rb_gc_max) while user I/O is flagged as active.
+ * Returns 1 if the insertion is allowed, 0 otherwise.
+ */
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries)
+{
+	int in_buffer = atomic_read(&rl->rb_gc_cnt);
+	int user_active;
+
+	/* If there is no user I/O let GC take over space on the write buffer */
+	user_active = READ_ONCE(rl->rb_user_active);
+
+	return in_buffer + nr_entries <= rl->rb_gc_max || !user_active;
+}
+
+/*
+ * Account for nr_entries user entries entering the write buffer, flag user
+ * I/O as active and re-arm the timer that clears that flag.
+ */
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries)
+{
+	atomic_add(nr_entries, &rl->rb_user_cnt);
+
+	/* Publish the user I/O state with release semantics; GC reads it */
+	smp_store_release(&rl->rb_user_active, 1);
+
+	pblk_rl_kick_u_timer(rl);
+}
+
+/* Account for nr_entries GC entries entering the write buffer. */
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries)
+{
+	atomic_t *gc_cnt = &rl->rb_gc_cnt;
+
+	atomic_add(nr_entries, gc_cnt);
+}
+
+/*
+ * Account for entries leaving the write buffer: nr_user is subtracted from
+ * the user counter and nr_gc from the GC counter.
+ */
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc)
+{
+	atomic_t *user_cnt = &rl->rb_user_cnt;
+	atomic_t *gc_cnt = &rl->rb_gc_cnt;
+
+	atomic_sub(nr_user, user_cnt);
+	atomic_sub(nr_gc, gc_cnt);
+}
+
+/* Return the current value of the free block counter. */
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
+{
+	unsigned long nr_free = atomic_read(&rl->free_blocks);
+
+	return nr_free;
+}
+
+/*
+ * Recompute the user and GC shares of the write buffer from the number of
+ * free blocks in the pblk instance.
+ *
+ * At or above the high threshold GC only keeps its reserved entries and the
+ * user gets the rest of the budget (max). Below it the user share shrinks
+ * with the number of free blocks and GC takes what is left, but never less
+ * than its reservation.
+ *
+ * Only the total number of free blocks is used to configure the rate limiter.
+ *
+ * Returns the new rate limiter state (PBLK_RL_HIGH, PBLK_RL_MID or
+ * PBLK_RL_LOW).
+ */
+static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max)
+{
+	unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
+
+	if (free_blocks >= rl->high) {
+		rl->rb_user_max = max - rl->rb_gc_rsv;
+		rl->rb_gc_max = rl->rb_gc_rsv;
+		rl->rb_state = PBLK_RL_HIGH;
+	} else {
+		int shift = rl->high_pw - rl->rb_windows_pw;
+		int user_windows = free_blocks >> shift;
+		int user_max = user_windows << PBLK_MAX_REQ_ADDRS_PW;
+		int gc_max;
+
+		rl->rb_user_max = user_max;
+
+		/* GC gets the remainder, but at least its reserved share */
+		gc_max = max - rl->rb_user_max;
+		rl->rb_gc_max = max(gc_max, rl->rb_gc_rsv);
+
+		rl->rb_state = (free_blocks > rl->low) ?
+					PBLK_RL_MID : PBLK_RL_LOW;
+	}
+
+	return rl->rb_state;
+}
+
+/* Set both the current GC budget and the GC reservation to rsv entries. */
+void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv)
+{
+	rl->rb_gc_max = rsv;
+	rl->rb_gc_rsv = rl->rb_gc_max;
+}
+
+/*
+ * Add the blocks of a line to the free block count, update the rates and
+ * start or stop GC according to the resulting rate limiter state.
+ *
+ * GC is asked to start when the state is PBLK_RL_MID or PBLK_RL_LOW. The
+ * state is compared against each value separately: a single state can never
+ * equal the bitwise OR of two states, so the previous
+ * "ret == (PBLK_RL_MID | PBLK_RL_LOW)" test never requested a GC start.
+ * NOTE(review): assumes the PBLK_RL_* values are distinct - confirm in pblk.h.
+ */
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
+{
+	struct pblk *pblk = container_of(rl, struct pblk, rl);
+	int blk_in_line = atomic_read(&line->blk_in_line);
+	int ret;
+
+	atomic_add(blk_in_line, &rl->free_blocks);
+	/* Rates will not change that often - no need to lock update */
+	ret = pblk_rl_update_rates(rl, rl->rb_budget);
+
+	if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
+		pblk_gc_should_start(pblk);
+	else
+		pblk_gc_should_stop(pblk);
+}
+
+/*
+ * Remove the blocks of a line from the free block count, update the rates
+ * and start or stop GC according to the resulting rate limiter state.
+ *
+ * GC is asked to start when the state is PBLK_RL_MID or PBLK_RL_LOW. The
+ * state is compared against each value separately: a single state can never
+ * equal the bitwise OR of two states, so the previous
+ * "ret == (PBLK_RL_MID | PBLK_RL_LOW)" test never requested a GC start.
+ * NOTE(review): assumes the PBLK_RL_* values are distinct - confirm in pblk.h.
+ */
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
+{
+	struct pblk *pblk = container_of(rl, struct pblk, rl);
+	int blk_in_line = atomic_read(&line->blk_in_line);
+	int ret;
+
+	atomic_sub(blk_in_line, &rl->free_blocks);
+
+	/* Rates will not change that often - no need to lock update */
+	ret = pblk_rl_update_rates(rl, rl->rb_budget);
+
+	if (ret == PBLK_RL_MID || ret == PBLK_RL_LOW)
+		pblk_gc_should_start(pblk);
+	else
+		pblk_gc_should_stop(pblk);
+}
+
+/* Return the high free-block threshold of the rate limiter. */
+int pblk_rl_gc_thrs(struct pblk_rl *rl)
+{
+	int thrs = rl->high;
+
+	return thrs;
+}
+
+/* Return the current user budget of the write buffer (rb_user_max). */
+int pblk_rl_sysfs_rate_show(struct pblk_rl *rl)
+{
+	int user_max = rl->rb_user_max;
+
+	return user_max;
+}
+
+/*
+ * Timer callback: the timer expired without being re-armed by
+ * pblk_rl_user_in(), so user I/O is flagged as inactive. The data argument
+ * carries the struct pblk_rl pointer.
+ */
+static void pblk_rl_u_timer(unsigned long data)
+{
+	struct pblk_rl *rl = (struct pblk_rl *)data;
+
+	/* Clear the user I/O state with release semantics; GC reads it */
+	smp_store_release(&rl->rb_user_active, 0);
+}
+
+/*
+ * Tear down the rate limiter.
+ *
+ * The timer is stopped with del_timer_sync() so that a callback that is
+ * running on another CPU has finished before the caller goes on to release
+ * the memory holding the rate limiter. del_timer() alone only deactivates a
+ * pending timer and does not wait for a running handler.
+ * NOTE(review): del_timer_sync() must not be called from interrupt context;
+ * this assumes pblk_rl_free() is only used on the process-context teardown
+ * path - confirm against the callers.
+ */
+void pblk_rl_free(struct pblk_rl *rl)
+{
+	del_timer_sync(&rl->u_timer);
+}
+
+/*
+ * Initialize the rate limiter for a write buffer of budget entries.
+ *
+ * The high and low thresholds are derived from total_blocks, which must have
+ * been set before this call. The whole budget starts out assigned to user
+ * I/O, the GC share starts at 0 and the state is PBLK_RL_HIGH.
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget)
+{
+	unsigned int rb_windows;
+
+	/* Free-block thresholds */
+	rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
+	rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
+	rl->high_pw = get_count_order(rl->high);
+
+	/* This will always be a power-of-2 */
+	rb_windows = budget / PBLK_MAX_REQ_ADDRS;
+	rl->rb_windows_pw = get_count_order(rb_windows) + 1;
+
+	/* To start with, all buffer is available to user I/O writers */
+	rl->rb_budget = budget;
+	rl->rb_state = PBLK_RL_HIGH;
+
+	rl->rb_user_max = budget;
+	atomic_set(&rl->rb_user_cnt, 0);
+
+	rl->rb_gc_max = 0;
+	atomic_set(&rl->rb_gc_cnt, 0);
+
+	/* User I/O starts out inactive; the timer clears the flag later on */
+	setup_timer(&rl->u_timer, pblk_rl_u_timer, (unsigned long)rl);
+	rl->rb_user_active = 0;
+}
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
new file mode 100644
index 0000000000000..f0af1d1ceeff1
--- /dev/null
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -0,0 +1,507 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a physical block-device target for Open-channel SSDs.
+ *
+ * pblk-sysfs.c - pblk's sysfs
+ *
+ */
+
+#include "pblk.h"
+
+/*
+ * List every LUN of the target: position, channel, LUN id and a flag that is
+ * 1 when the LUN's write semaphore could not be taken and 0 when it could
+ * (in which case it is released again immediately).
+ *
+ * scnprintf() returns the number of characters actually stored, so the
+ * running offset can never move past the end of the sysfs page, however many
+ * LUNs the device exposes. With snprintf() the offset could exceed PAGE_SIZE
+ * and "PAGE_SIZE - sz" would wrap around.
+ *
+ * Returns the number of bytes written to page.
+ */
+static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_lun *rlun;
+	ssize_t sz = 0;
+	int i;
+
+	for (i = 0; i < geo->nr_luns; i++) {
+		int active = 1;
+
+		rlun = &pblk->luns[i];
+		/* down_trylock() returns 0 when the semaphore was acquired */
+		if (!down_trylock(&rlun->wr_sem)) {
+			active = 0;
+			up(&rlun->wr_sem);
+		}
+		sz += scnprintf(page + sz, PAGE_SIZE - sz,
+				"pblk: pos:%d, ch:%d, lun:%d - %d\n",
+					i,
+					rlun->bppa.g.ch,
+					rlun->bppa.g.lun,
+					active);
+	}
+
+	return sz;
+}
+
+/*
+ * Print a one-line snapshot of the rate limiter: user and GC buffer usage
+ * against their budgets, the GC reservation, state, total budget, the
+ * low/high thresholds, free vs. total blocks and the user-active flag.
+ */
+static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_rl *rl = &pblk->rl;
+	int free_blocks = atomic_read(&rl->free_blocks);
+	int rb_user_max = rl->rb_user_max;
+	int rb_user_cnt = atomic_read(&rl->rb_user_cnt);
+	int rb_gc_max = rl->rb_gc_max;
+	int rb_gc_rsv = rl->rb_gc_rsv;
+	int rb_gc_cnt = atomic_read(&rl->rb_gc_cnt);
+	int rb_budget = rl->rb_budget;
+	int rb_state = rl->rb_state;
+	int total_blocks = geo->blks_per_lun * geo->nr_luns;
+
+	return snprintf(page, PAGE_SIZE,
+		"u:%u/%u,gc:%u/%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
+				rb_user_cnt,
+				rb_user_max,
+				rb_gc_cnt,
+				rb_gc_max,
+				rb_gc_rsv,
+				rb_state,
+				rb_budget,
+				rl->low,
+				rl->high,
+				free_blocks,
+				total_blocks,
+				READ_ONCE(rl->rb_user_active));
+}
+
+/* Print the gc_enabled and gc_active values reported by the GC module. */
+static ssize_t pblk_sysfs_gc_state_show(struct pblk *pblk, char *page)
+{
+	int enabled, active;
+
+	pblk_gc_sysfs_state_show(pblk, &enabled, &active);
+
+	return snprintf(page, PAGE_SIZE, "gc_enabled=%d, gc_active=%d\n",
+			enabled, active);
+}
+
+static ssize_t pblk_sysfs_stats(struct pblk *pblk, char *page)
+{
+ ssize_t sz;
+
+ sz = snprintf(page, PAGE_SIZE,
+ "read_failed=%lu, read_high_ecc=%lu, read_empty=%lu, read_failed_gc=%lu, write_failed=%lu, erase_failed=%lu\n",
+ atomic_long_read(&pblk->read_failed),
+ atomic_long_read(&pblk->read_high_ecc),
+ atomic_long_read(&pblk->read_empty),
+ atomic_long_read(&pblk->read_failed_gc),
+ atomic_long_read(&pblk->write_failed),
+ atomic_long_read(&pblk->erase_failed));
+
+ return sz;
+}
+
+/* Print the write buffer state; formatting is delegated to pblk_rb_sysfs(). */
+static ssize_t pblk_sysfs_write_buffer(struct pblk *pblk, char *page)
+{
+	struct pblk_rb *rb = &pblk->rwb;
+
+	return pblk_rb_sysfs(rb, page);
+}
+
+/*
+ * Print the physical address format: one line ("g:") with pblk's own field
+ * offsets next to the device field lengths, and one line ("d:") with the
+ * device's offsets and lengths.
+ */
+static ssize_t pblk_sysfs_ppaf(struct pblk *pblk, char *page)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	ssize_t sz;
+
+	sz = snprintf(page, PAGE_SIZE,
+		"g:(b:%d)blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+		pblk->ppaf_bitsize,
+		pblk->ppaf.blk_offset, geo->ppaf.blk_len,
+		pblk->ppaf.pg_offset, geo->ppaf.pg_len,
+		pblk->ppaf.lun_offset, geo->ppaf.lun_len,
+		pblk->ppaf.ch_offset, geo->ppaf.ch_len,
+		pblk->ppaf.pln_offset, geo->ppaf.pln_len,
+		pblk->ppaf.sec_offset, geo->ppaf.sect_len);
+
+	sz += snprintf(page + sz, PAGE_SIZE - sz,
+		"d:blk:%d/%d,pg:%d/%d,lun:%d/%d,ch:%d/%d,pl:%d/%d,sec:%d/%d\n",
+		geo->ppaf.blk_offset, geo->ppaf.blk_len,
+		geo->ppaf.pg_offset, geo->ppaf.pg_len,
+		geo->ppaf.lun_offset, geo->ppaf.lun_len,
+		geo->ppaf.ch_offset, geo->ppaf.ch_len,
+		geo->ppaf.pln_offset, geo->ppaf.pln_len,
+		geo->ppaf.sect_offset, geo->ppaf.sect_len);
+
+	return sz;
+}
+
+/*
+ * Count the lines on one GC list. Data and log lines are additionally added
+ * to *d_line_cnt and *l_line_cnt. Returns the number of lines on the list.
+ */
+static int pblk_sysfs_count_gc_list(struct list_head *gc_list,
+				    int *d_line_cnt, int *l_line_cnt)
+{
+	struct pblk_line *line;
+	int nr_lines = 0;
+
+	list_for_each_entry(line, gc_list, list) {
+		if (line->type == PBLK_LINETYPE_DATA)
+			(*d_line_cnt)++;
+		else if (line->type == PBLK_LINETYPE_LOG)
+			(*l_line_cnt)++;
+		nr_lines++;
+	}
+
+	return nr_lines;
+}
+
+/*
+ * Print line statistics: geometry, number of lines per list (free, bad,
+ * corrupt, closed and per GC list) and the state of the current data line.
+ *
+ * The free list is only walked while holding free_lock; the count obtained
+ * there is used for the "f:" field as well, instead of walking the list a
+ * second time under gc_lock as the code did before. A mismatch between that
+ * count and nr_free_lines is logged as a corrupted free list.
+ */
+static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+	struct pblk_line *line;
+	ssize_t sz = 0;
+	int nr_free_lines;
+	int cur_data, cur_log;
+	int free_line_cnt = 0, closed_line_cnt = 0;
+	int d_line_cnt = 0, l_line_cnt = 0;
+	int gc_full = 0, gc_high = 0, gc_mid = 0, gc_low = 0, gc_empty = 0;
+	int bad = 0, cor = 0;
+	int msecs = 0, ssecs = 0, cur_sec = 0, vsc = 0, sec_in_line = 0;
+	int map_weight = 0, meta_weight = 0;
+
+	spin_lock(&l_mg->free_lock);
+	cur_data = (l_mg->data_line) ? l_mg->data_line->id : -1;
+	cur_log = (l_mg->log_line) ? l_mg->log_line->id : -1;
+	nr_free_lines = l_mg->nr_free_lines;
+
+	list_for_each_entry(line, &l_mg->free_list, list)
+		free_line_cnt++;
+	spin_unlock(&l_mg->free_lock);
+
+	spin_lock(&l_mg->gc_lock);
+	gc_full = pblk_sysfs_count_gc_list(&l_mg->gc_full_list,
+					   &d_line_cnt, &l_line_cnt);
+	gc_high = pblk_sysfs_count_gc_list(&l_mg->gc_high_list,
+					   &d_line_cnt, &l_line_cnt);
+	gc_mid = pblk_sysfs_count_gc_list(&l_mg->gc_mid_list,
+					  &d_line_cnt, &l_line_cnt);
+	gc_low = pblk_sysfs_count_gc_list(&l_mg->gc_low_list,
+					  &d_line_cnt, &l_line_cnt);
+	gc_empty = pblk_sysfs_count_gc_list(&l_mg->gc_empty_list,
+					    &d_line_cnt, &l_line_cnt);
+	/* Every line on a GC list is a closed line */
+	closed_line_cnt = gc_full + gc_high + gc_mid + gc_low + gc_empty;
+
+	list_for_each_entry(line, &l_mg->bad_list, list)
+		bad++;
+	list_for_each_entry(line, &l_mg->corrupt_list, list)
+		cor++;
+	spin_unlock(&l_mg->gc_lock);
+
+	spin_lock(&l_mg->free_lock);
+	if (l_mg->data_line) {
+		cur_sec = l_mg->data_line->cur_sec;
+		msecs = l_mg->data_line->left_msecs;
+		ssecs = l_mg->data_line->left_ssecs;
+		vsc = l_mg->data_line->vsc;
+		sec_in_line = l_mg->data_line->sec_in_line;
+		meta_weight = bitmap_weight(&l_mg->meta_bitmap,
+							PBLK_DATA_LINES);
+		map_weight = bitmap_weight(l_mg->data_line->map_bitmap,
+							lm->sec_per_line);
+	}
+	spin_unlock(&l_mg->free_lock);
+
+	if (nr_free_lines != free_line_cnt)
+		pr_err("pblk: corrupted free line list\n");
+
+	sz = snprintf(page, PAGE_SIZE - sz,
+		"line: nluns:%d, nblks:%d, nsecs:%d\n",
+		geo->nr_luns, lm->blk_per_line, lm->sec_per_line);
+
+	sz += snprintf(page + sz, PAGE_SIZE - sz,
+		"lines:d:%d,l:%d-f:%d(%d),b:%d,co:%d,c:%d(d:%d,l:%d)t:%d\n",
+					cur_data, cur_log,
+					free_line_cnt, nr_free_lines, bad, cor,
+					closed_line_cnt,
+					d_line_cnt, l_line_cnt,
+					l_mg->nr_lines);
+
+	sz += snprintf(page + sz, PAGE_SIZE - sz,
+		"GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n",
+			gc_full, gc_high, gc_mid, gc_low, gc_empty,
+			atomic_read(&pblk->gc.inflight_gc));
+
+	sz += snprintf(page + sz, PAGE_SIZE - sz,
+		"data (%d) cur:%d, left:%d/%d, vsc:%d, s:%d, map:%d/%d (%d)\n",
+			cur_data, cur_sec, msecs, ssecs, vsc, sec_in_line,
+			map_weight, lm->sec_per_line, meta_weight);
+
+	return sz;
+}
+
+/*
+ * Print the static line layout: smeta/emeta sizes, bitmap lengths and the
+ * number of blocks and sectors per line and per block.
+ */
+static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct nvm_geo *geo = &dev->geo;
+	struct pblk_line_meta *lm = &pblk->lm;
+	ssize_t sz;
+
+	sz = snprintf(page, PAGE_SIZE,
+				"smeta - len:%d, secs:%d\n",
+					lm->smeta_len, lm->smeta_sec);
+
+	sz += snprintf(page + sz, PAGE_SIZE - sz,
+				"emeta - len:%d, sec:%d, bb_start:%d\n",
+					lm->emeta_len, lm->emeta_sec,
+					lm->emeta_bb);
+
+	sz += snprintf(page + sz, PAGE_SIZE - sz,
+				"bitmap lengths: sec:%d, blk:%d, lun:%d\n",
+					lm->sec_bitmap_len,
+					lm->blk_bitmap_len,
+					lm->lun_bitmap_len);
+
+	sz += snprintf(page + sz, PAGE_SIZE - sz,
+				"blk_line:%d, sec_line:%d, sec_blk:%d\n",
+					lm->blk_per_line,
+					lm->sec_per_line,
+					geo->sec_per_blk);
+
+	return sz;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+/*
+ * Print the debug counters as one tab-separated line, in this order:
+ * inflight_writes, inflight_reads, req_writes, nr_flush, padded_writes,
+ * padded_wb, sub_writes, sync_writes, compl_writes, recov_writes,
+ * recov_gc_writes, recov_gc_reads, sync_reads.
+ */
+static ssize_t pblk_sysfs_stats_debug(struct pblk *pblk, char *page)
+{
+	ssize_t sz;
+
+	sz = snprintf(page, PAGE_SIZE,
+		"%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\t%lu\n",
+			atomic_long_read(&pblk->inflight_writes),
+			atomic_long_read(&pblk->inflight_reads),
+			atomic_long_read(&pblk->req_writes),
+			atomic_long_read(&pblk->nr_flush),
+			atomic_long_read(&pblk->padded_writes),
+			atomic_long_read(&pblk->padded_wb),
+			atomic_long_read(&pblk->sub_writes),
+			atomic_long_read(&pblk->sync_writes),
+			atomic_long_read(&pblk->compl_writes),
+			atomic_long_read(&pblk->recov_writes),
+			atomic_long_read(&pblk->recov_gc_writes),
+			atomic_long_read(&pblk->recov_gc_reads),
+			atomic_long_read(&pblk->sync_reads));
+
+	return sz;
+}
+#endif
+
+/*
+ * Set the GC reservation of the rate limiter from a sysfs write.
+ *
+ * The input must be a newline-terminated, non-negative integer that fits in
+ * an int. It is parsed with kstrtoint() because the destination is an int;
+ * the previous kstrtouint() call was handed an int pointer and let values
+ * above INT_MAX through as a negative reservation.
+ *
+ * Returns len on success, -EINVAL on malformed or out-of-range input.
+ */
+static ssize_t pblk_sysfs_rate_store(struct pblk *pblk, const char *page,
+				     size_t len)
+{
+	struct pblk_gc *gc = &pblk->gc;
+	size_t c_len;
+	int value;
+
+	/* Require a newline within the first len bytes */
+	c_len = strcspn(page, "\n");
+	if (c_len >= len)
+		return -EINVAL;
+
+	if (kstrtoint(page, 0, &value))
+		return -EINVAL;
+
+	if (value < 0)
+		return -EINVAL;
+
+	spin_lock(&gc->lock);
+	pblk_rl_set_gc_rsc(&pblk->rl, value);
+	spin_unlock(&gc->lock);
+
+	return len;
+}
+
+/*
+ * Force GC on or off from a sysfs write.
+ *
+ * The input must be a newline-terminated "0" or "1". It is parsed with
+ * kstrtoint() so that the parser matches the int destination and the
+ * "force < 0" range check; the previous kstrtouint() call was handed an int
+ * pointer. The set of accepted inputs is unchanged.
+ *
+ * Returns len on success, -EINVAL on malformed or out-of-range input.
+ */
+static ssize_t pblk_sysfs_gc_force(struct pblk *pblk, const char *page,
+				   size_t len)
+{
+	size_t c_len;
+	int force;
+
+	/* Require a newline within the first len bytes */
+	c_len = strcspn(page, "\n");
+	if (c_len >= len)
+		return -EINVAL;
+
+	if (kstrtoint(page, 0, &force))
+		return -EINVAL;
+
+	if (force < 0 || force > 1)
+		return -EINVAL;
+
+	pblk_gc_sysfs_force(pblk, force);
+
+	return len;
+}
+/*
+ * sysfs attribute descriptors for the pblk kobject.
+ *
+ * Entries with mode 0444 are read-only and entries with mode 0200 are
+ * write-only. pblk_sysfs_show() and pblk_sysfs_store() dispatch on the .name
+ * string, so a name listed here must match the string compared there.
+ * The "stats" attribute only exists when CONFIG_NVM_DEBUG is set.
+ * pblk_attrs[] is the NULL-terminated list installed as the default
+ * attributes of pblk_ktype.
+ */
+static struct attribute sys_write_luns = {
+ .name = "write_luns",
+ .mode = 0444,
+};
+
+static struct attribute sys_rate_limiter_attr = {
+ .name = "rate_limiter",
+ .mode = 0444,
+};
+
+static struct attribute sys_gc_state = {
+ .name = "gc_state",
+ .mode = 0444,
+};
+
+static struct attribute sys_errors_attr = {
+ .name = "errors",
+ .mode = 0444,
+};
+
+static struct attribute sys_rb_attr = {
+ .name = "write_buffer",
+ .mode = 0444,
+};
+
+static struct attribute sys_stats_ppaf_attr = {
+ .name = "ppa_format",
+ .mode = 0444,
+};
+
+static struct attribute sys_lines_attr = {
+ .name = "lines",
+ .mode = 0444,
+};
+
+static struct attribute sys_lines_info_attr = {
+ .name = "lines_info",
+ .mode = 0444,
+};
+
+static struct attribute sys_gc_force = {
+ .name = "gc_force",
+ .mode = 0200,
+};
+
+static struct attribute sys_gc_rl_max = {
+ .name = "gc_rl_max",
+ .mode = 0200,
+};
+
+#ifdef CONFIG_NVM_DEBUG
+static struct attribute sys_stats_debug_attr = {
+ .name = "stats",
+ .mode = 0444,
+};
+#endif
+
+static struct attribute *pblk_attrs[] = {
+ &sys_write_luns,
+ &sys_rate_limiter_attr,
+ &sys_errors_attr,
+ &sys_gc_state,
+ &sys_gc_force,
+ &sys_gc_rl_max,
+ &sys_rb_attr,
+ &sys_stats_ppaf_attr,
+ &sys_lines_attr,
+ &sys_lines_info_attr,
+#ifdef CONFIG_NVM_DEBUG
+ &sys_stats_debug_attr,
+#endif
+ NULL,
+};
+
+/*
+ * sysfs show entry point: select the handler by attribute name and let it
+ * fill buf. Returns the handler's result, or 0 for an unknown attribute.
+ */
+static ssize_t pblk_sysfs_show(struct kobject *kobj, struct attribute *attr,
+			       char *buf)
+{
+	struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+	const char *name = attr->name;
+
+	if (!strcmp(name, "rate_limiter"))
+		return pblk_sysfs_rate_limiter(pblk, buf);
+	if (!strcmp(name, "write_luns"))
+		return pblk_sysfs_luns_show(pblk, buf);
+	if (!strcmp(name, "gc_state"))
+		return pblk_sysfs_gc_state_show(pblk, buf);
+	if (!strcmp(name, "errors"))
+		return pblk_sysfs_stats(pblk, buf);
+	if (!strcmp(name, "write_buffer"))
+		return pblk_sysfs_write_buffer(pblk, buf);
+	if (!strcmp(name, "ppa_format"))
+		return pblk_sysfs_ppaf(pblk, buf);
+	if (!strcmp(name, "lines"))
+		return pblk_sysfs_lines(pblk, buf);
+	if (!strcmp(name, "lines_info"))
+		return pblk_sysfs_lines_info(pblk, buf);
+#ifdef CONFIG_NVM_DEBUG
+	if (!strcmp(name, "stats"))
+		return pblk_sysfs_stats_debug(pblk, buf);
+#endif
+	return 0;
+}
+
+/*
+ * sysfs store entry point: select the handler by attribute name and pass it
+ * the user buffer. Returns the handler's result, or 0 for an unknown
+ * attribute.
+ */
+static ssize_t pblk_sysfs_store(struct kobject *kobj, struct attribute *attr,
+				const char *buf, size_t len)
+{
+	struct pblk *pblk = container_of(kobj, struct pblk, kobj);
+	const char *name = attr->name;
+
+	if (!strcmp(name, "gc_rl_max"))
+		return pblk_sysfs_rate_store(pblk, buf, len);
+	if (!strcmp(name, "gc_force"))
+		return pblk_sysfs_gc_force(pblk, buf, len);
+
+	return 0;
+}
+/*
+ * kobject glue: pblk_sysfs_ops routes every attribute read and write to
+ * pblk_sysfs_show() and pblk_sysfs_store(); pblk_ktype ties those operations
+ * to the default attribute list pblk_attrs and is the type passed to
+ * kobject_init_and_add() in pblk_sysfs_init().
+ */
+static const struct sysfs_ops pblk_sysfs_ops = {
+ .show = pblk_sysfs_show,
+ .store = pblk_sysfs_store,
+};
+
+static struct kobj_type pblk_ktype = {
+ .sysfs_ops = &pblk_sysfs_ops,
+ .default_attrs = pblk_attrs,
+};
+
+/*
+ * Register the "pblk" kobject under the disk's device and announce it with a
+ * KOBJ_ADD uevent.
+ *
+ * Returns 0 on success or the error from kobject_init_and_add().
+ */
+int pblk_sysfs_init(struct gendisk *tdisk)
+{
+	struct pblk *pblk = tdisk->private_data;
+	struct device *parent_dev = disk_to_dev(pblk->disk);
+	int ret;
+
+	ret = kobject_init_and_add(&pblk->kobj, &pblk_ktype,
+					kobject_get(&parent_dev->kobj),
+					"%s", "pblk");
+	if (!ret) {
+		kobject_uevent(&pblk->kobj, KOBJ_ADD);
+		return 0;
+	}
+
+	pr_err("pblk: could not register %s/pblk\n",
+					tdisk->disk_name);
+	return ret;
+}
+
+/*
+ * Remove the "pblk" kobject: send a KOBJ_REMOVE uevent, delete the kobject
+ * and drop a reference on it.
+ */
+void pblk_sysfs_exit(struct gendisk *tdisk)
+{
+	struct pblk *pblk = tdisk->private_data;
+	struct kobject *kobj = &pblk->kobj;
+
+	kobject_uevent(kobj, KOBJ_REMOVE);
+	kobject_del(kobj);
+	kobject_put(kobj);
+}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
new file mode 100644
index 0000000000000..aef6fd7c4a0cb
--- /dev/null
+++ b/drivers/lightnvm/pblk-write.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Javier Gonzalez <javier@cnexlabs.com>
+ * Matias Bjorling <matias@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * pblk-write.c - pblk's write path from write buffer to media
+ */
+
+#include "pblk.h"
+
+/*
+ * Account one sector of @line as synced. Once the line has no sectors left to
+ * sync, pblk_line_close_ws is handed to pblk_line_run_ws() for that line.
+ */
+static void pblk_sync_line(struct pblk *pblk, struct pblk_line *line)
+{
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_inc(&pblk->sync_writes);
+#endif
+
+	/* Counter protected by rb sync lock */
+	if (--line->left_ssecs)
+		return;
+
+	pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws);
+}
+
+/*
+ * Finish a write request whose entries are next in line to be synced.
+ *
+ * For each of the c_ctx->nr_valid entries starting at c_ctx->sentry, the
+ * sector is accounted against the line its ppa belongs to and every bio
+ * queued on the entry's write context is ended. Afterwards the write buffer's
+ * sync position is advanced by nr_valid and the request's metadata buffer,
+ * bio and rqd are released.
+ *
+ * Callers in this file invoke it between pblk_rb_sync_init() and
+ * pblk_rb_sync_end().
+ *
+ * NOTE(review): ppas are read from rqd->ppa_list unconditionally, whereas
+ * pblk_end_w_fail() switches to &rqd->ppa_addr when nr_ppas == 1 and
+ * pblk_alloc_w_rq() leaves ppa_list untouched in that case - confirm that
+ * single-sector requests cannot reach this point.
+ *
+ * Returns the value of pblk_rb_sync_advance().
+ */
+static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
+				    struct pblk_c_ctx *c_ctx)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+	struct bio *bio;
+	unsigned long new_sync;
+	int idx;
+
+	for (idx = 0; idx < c_ctx->nr_valid; idx++) {
+		struct pblk_w_ctx *w_ctx = pblk_rb_w_ctx(&pblk->rwb,
+							 c_ctx->sentry + idx);
+		struct ppa_addr ppa = rqd->ppa_list[idx];
+
+		pblk_sync_line(pblk, &pblk->lines[pblk_dev_ppa_to_line(ppa)]);
+
+		/* Drain and complete every bio attached to this entry */
+		for (bio = bio_list_pop(&w_ctx->bios); bio;
+		     bio = bio_list_pop(&w_ctx->bios))
+			bio_endio(bio);
+	}
+
+#ifdef CONFIG_NVM_DEBUG
+	atomic_long_add(c_ctx->nr_valid, &pblk->compl_writes);
+#endif
+
+	new_sync = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid);
+
+	if (rqd->meta_list)
+		nvm_dev_dma_free(dev->parent, rqd->meta_list,
+				 rqd->dma_meta_list);
+
+	bio_put(rqd->bio);
+	pblk_free_rqd(pblk, rqd, WRITE);
+
+	return new_sync;
+}
+
+/*
+ * Finish a write request that was parked on pblk->compl_list: unlink its
+ * completion context from the list, then complete it with pblk_end_w_bio().
+ * Returns pblk_end_w_bio()'s result.
+ */
+static unsigned long pblk_end_queued_w_bio(struct pblk *pblk,
+					   struct nvm_rq *rqd,
+					   struct pblk_c_ctx *c_ctx)
+{
+	unsigned long pos;
+
+	list_del(&c_ctx->list);
+	pos = pblk_end_w_bio(pblk, rqd, c_ctx);
+
+	return pos;
+}
+
+/* Complete a write request while keeping buffer entries in sync order.
+ *
+ * pblk_up_rq() is called first with the request's ppa list and LUN bitmap. If
+ * the request starts at the write buffer's current sync position (as returned
+ * by pblk_rb_sync_init()), it is finished right away through pblk_end_w_bio().
+ * pblk->compl_list is then scanned for parked requests that start at the new
+ * position, finishing each match and restarting the scan, until no entry
+ * matches. A request that does not start at the sync position is appended to
+ * pblk->compl_list instead, to be finished by a later call.
+ */
+static void pblk_complete_write(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx)
+{
+ struct pblk_c_ctx *c, *r;
+ unsigned long flags;
+ unsigned long pos;
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_sub(c_ctx->nr_valid, &pblk->inflight_writes);
+#endif
+
+ pblk_up_rq(pblk, rqd->ppa_list, rqd->nr_ppas, c_ctx->lun_bitmap);
+
+ pos = pblk_rb_sync_init(&pblk->rwb, &flags);
+ if (pos == c_ctx->sentry) {
+ /* In order: finish now; pos becomes the advanced sync position */
+ pos = pblk_end_w_bio(pblk, rqd, c_ctx);
+
+retry:
+ /* Each match moves pos and hands the matched request to
+ * pblk_free_rqd(), so the walk restarts from the list head after
+ * every hit instead of continuing from the saved next entry.
+ */
+ list_for_each_entry_safe(c, r, &pblk->compl_list, list) {
+ rqd = nvm_rq_from_c_ctx(c);
+ if (c->sentry == pos) {
+ pos = pblk_end_queued_w_bio(pblk, rqd, c);
+ goto retry;
+ }
+ }
+ } else {
+ /* Out of order: park the request until the sync position reaches it */
+ WARN_ON(nvm_rq_from_c_ctx(c_ctx) != rqd);
+ list_add_tail(&c_ctx->list, &pblk->compl_list);
+ }
+ pblk_rb_sync_end(&pblk->rwb, &flags);
+}
+
+/* When a write fails, we are not sure whether the block has grown bad or a page
+ * range is more susceptible to write errors. If a high number of pages fail, we
+ * assume that the block is bad and we mark it accordingly. In all cases, we
+ * remap and resubmit the failed entries as fast as possible; if a flush is
+ * waiting on a completion, the whole stack would stall otherwise.
+ *
+ * What this function does: every bit set in rqd->ppa_status is treated as a
+ * failed ppa. The matching write buffer entry is looked up with
+ * pblk_rb_sync_scan_entry() and linked onto a freshly allocated recovery
+ * context, which is then prepared by pblk_recov_setup_rq() and queued on
+ * pblk->kw_wq to run pblk_submit_rec(). Except for the allocation failure
+ * noted below, the original request is always passed on to
+ * pblk_complete_write(), whether or not recovery could be set up.
+ */
+static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd)
+{
+ void *comp_bits = &rqd->ppa_status;
+ struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+ struct pblk_rec_ctx *recovery;
+ struct ppa_addr *ppa_list = rqd->ppa_list;
+ int nr_ppas = rqd->nr_ppas;
+ unsigned int c_entries;
+ int bit, ret;
+
+ /* A single-ppa request carries its address in rqd->ppa_addr */
+ if (unlikely(nr_ppas == 1))
+ ppa_list = &rqd->ppa_addr;
+
+ recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC);
+ if (!recovery) {
+ pr_err("pblk: could not allocate recovery context\n");
+ /* NOTE(review): unlike the error paths below, this one returns
+ * without calling pblk_complete_write() - confirm this is intended.
+ */
+ return;
+ }
+ INIT_LIST_HEAD(&recovery->failed);
+
+ /* Visit every set bit in the completion status, lowest index first */
+ bit = -1;
+ while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
+ struct pblk_rb_entry *entry;
+ struct ppa_addr ppa;
+
+ /* Logic error */
+ if (bit > c_ctx->nr_valid) {
+ WARN_ONCE(1, "pblk: corrupted write request\n");
+ mempool_free(recovery, pblk->rec_pool);
+ goto out;
+ }
+
+ ppa = ppa_list[bit];
+ entry = pblk_rb_sync_scan_entry(&pblk->rwb, &ppa);
+ if (!entry) {
+ pr_err("pblk: could not scan entry on write failure\n");
+ mempool_free(recovery, pblk->rec_pool);
+ goto out;
+ }
+
+ /* The list is filled first and emptied afterwards. No need for
+ * protecting it with a lock
+ */
+ list_add_tail(&entry->index, &recovery->failed);
+ }
+
+ /* Index of the first failed ppa, passed to pblk_recov_setup_rq() */
+ c_entries = find_first_bit(comp_bits, nr_ppas);
+ ret = pblk_recov_setup_rq(pblk, c_ctx, recovery, comp_bits, c_entries);
+ if (ret) {
+ pr_err("pblk: could not recover from write failure\n");
+ mempool_free(recovery, pblk->rec_pool);
+ goto out;
+ }
+
+ /* Resubmission runs from the pblk->kw_wq workqueue */
+ INIT_WORK(&recovery->ws_rec, pblk_submit_rec);
+ queue_work(pblk->kw_wq, &recovery->ws_rec);
+
+out:
+ pblk_complete_write(pblk, rqd, c_ctx);
+}
+
+/*
+ * end_io callback installed on write requests by pblk_alloc_w_rq().
+ *
+ * A request that completed with rqd->error set is logged and handed to
+ * pblk_end_w_fail(). Any other request goes straight to
+ * pblk_complete_write(); with CONFIG_NVM_DEBUG a one-time warning is raised
+ * if the bio of such a request nevertheless carries an error.
+ */
+static void pblk_end_io_write(struct nvm_rq *rqd)
+{
+	struct pblk *pblk = rqd->private;
+	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
+
+	if (rqd->error) {
+		pblk_log_write_err(pblk, rqd);
+		pblk_end_w_fail(pblk, rqd);
+		return;
+	}
+
+#ifdef CONFIG_NVM_DEBUG
+	WARN_ONCE(rqd->bio->bi_error, "pblk: corrupted write error\n");
+#endif
+
+	pblk_complete_write(pblk, rqd, c_ctx);
+}
+
+/*
+ * Fill in the generic fields of a write request and allocate its DMA
+ * metadata buffer.
+ *
+ * @nr_secs: number of ppas the request will carry.
+ *
+ * For any nr_secs other than 1 the ppa list is placed in the same DMA
+ * allocation, pblk_dma_meta_size bytes past the start of the metadata. For
+ * nr_secs == 1 rqd->ppa_list and rqd->dma_ppa_list are left untouched.
+ *
+ * Returns 0 on success or -ENOMEM if the DMA allocation fails.
+ */
+static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+			   unsigned int nr_secs)
+{
+	struct nvm_tgt_dev *dev = pblk->dev;
+
+	/* Setup write request */
+	rqd->opcode = NVM_OP_PWRITE;
+	rqd->nr_ppas = nr_secs;
+	rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+	rqd->private = pblk;
+	rqd->end_io = pblk_end_io_write;
+
+	rqd->meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
+					   &rqd->dma_meta_list);
+	if (!rqd->meta_list)
+		return -ENOMEM;
+
+	if (likely(nr_secs != 1)) {
+		rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
+		rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare a write request for submission: allocate the LUN bitmap and the
+ * request buffers, then map the buffered entries starting at c_ctx->sentry to
+ * physical addresses.
+ *
+ * If a next data line exists and still has blocks left to erase, mapping goes
+ * through pblk_map_erase_rq(), which may hand back a block address in
+ * erase_ppa; that block is then erased asynchronously. If submitting the erase
+ * fails, the block is put back: left_eblks is incremented again, its bit in
+ * the line's erase_bitmap is cleared and pblk->erase_sem is released.
+ *
+ * Allocation failures return immediately. They must not reach the erase
+ * handling, because erase_ppa has not been initialised at that point.
+ *
+ * Returns 0 on success, -ENOMEM if the LUN bitmap cannot be allocated, or the
+ * error from pblk_alloc_w_rq(). On failure the LUN bitmap allocated here has
+ * been freed and c_ctx->lun_bitmap does not point at it.
+ */
+static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
+			   struct pblk_c_ctx *c_ctx)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	struct pblk_line *e_line = pblk_line_get_data_next(pblk);
+	struct ppa_addr erase_ppa;
+	unsigned int valid = c_ctx->nr_valid;
+	unsigned int padded = c_ctx->nr_padded;
+	unsigned int nr_secs = valid + padded;
+	unsigned long *lun_bitmap;
+	int ret;
+
+	lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+	if (!lun_bitmap)
+		return -ENOMEM;
+	c_ctx->lun_bitmap = lun_bitmap;
+
+	ret = pblk_alloc_w_rq(pblk, rqd, nr_secs);
+	if (ret) {
+		kfree(lun_bitmap);
+		/* Do not leave a dangling pointer in the completion context */
+		c_ctx->lun_bitmap = NULL;
+		return ret;
+	}
+
+	ppa_set_empty(&erase_ppa);
+	if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
+		pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
+	else
+		pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+				  valid, &erase_ppa);
+
+	if (unlikely(e_line && !ppa_empty(erase_ppa))) {
+		if (pblk_blk_erase_async(pblk, erase_ppa)) {
+			struct nvm_tgt_dev *dev = pblk->dev;
+			struct nvm_geo *geo = &dev->geo;
+			int bit;
+
+			/* Erase was not submitted: return the block to the line */
+			atomic_inc(&e_line->left_eblks);
+			bit = erase_ppa.g.lun * geo->nr_chnls + erase_ppa.g.ch;
+			WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
+			up(&pblk->erase_sem);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare a recovery write request: allocate its LUN bitmap and request
+ * buffers, map the c_ctx->nr_valid entries starting at c_ctx->sentry with
+ * pblk_map_rq(), and reset the completion status and flags. The number of
+ * ppas is taken from rqd->nr_ppas, which the caller must have set.
+ *
+ * Returns 0 on success, -ENOMEM if the LUN bitmap cannot be allocated, or the
+ * error from pblk_alloc_w_rq(). On failure the LUN bitmap allocated here has
+ * been freed and c_ctx->lun_bitmap does not point at it.
+ */
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+			struct pblk_c_ctx *c_ctx)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	unsigned long *lun_bitmap;
+	int ret;
+
+	lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL);
+	if (!lun_bitmap)
+		return -ENOMEM;
+
+	c_ctx->lun_bitmap = lun_bitmap;
+
+	ret = pblk_alloc_w_rq(pblk, rqd, rqd->nr_ppas);
+	if (ret) {
+		/* Same cleanup as pblk_setup_w_rq(): do not leak the bitmap */
+		kfree(lun_bitmap);
+		c_ctx->lun_bitmap = NULL;
+		return ret;
+	}
+
+	pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0);
+
+	rqd->ppa_status = (u64)0;
+	rqd->flags = pblk_set_progr_mode(pblk, WRITE);
+
+	return ret;
+}
+
+/*
+ * Thin wrapper around pblk_calc_secs(). Its result is returned unchanged;
+ * with CONFIG_NVM_DEBUG an error is logged when the result is negative, is
+ * zero although a flush is pending, or exceeds secs_avail although no flush
+ * is pending.
+ */
+static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
+				  unsigned int secs_to_flush)
+{
+	int nr_secs = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
+
+#ifdef CONFIG_NVM_DEBUG
+	if ((!nr_secs && secs_to_flush) || nr_secs < 0 ||
+	    (nr_secs > secs_avail && !secs_to_flush))
+		pr_err("pblk: bad sector calculation (a:%d,s:%d,f:%d)\n",
+		       secs_avail, nr_secs, secs_to_flush);
+#endif
+
+	return nr_secs;
+}
+
+/* Build one write request from the write buffer and submit it to the device.
+ *
+ * Nothing is submitted when the buffer reports no readable entries, or when it
+ * holds fewer than pblk->min_write_pgs and no sync point is pending. Otherwise
+ * a request and an internal bio are allocated, the number of sectors to write
+ * is computed, pblk_rb_read_commit() is called for the sectors taken from the
+ * buffer, the entries are attached to the bio (plus padding pages when
+ * c_ctx->nr_padded is set), the request is mapped by pblk_setup_w_rq() and
+ * finally sent with pblk_submit_io().
+ *
+ * Returns 0 when a request was submitted and 1 in every other case (nothing to
+ * do, or an error); pblk_write_ts() goes to sleep on a non-zero return.
+ */
+static int pblk_submit_write(struct pblk *pblk)
+{
+ struct bio *bio;
+ struct nvm_rq *rqd;
+ struct pblk_c_ctx *c_ctx;
+ unsigned int pgs_read;
+ unsigned int secs_avail, secs_to_sync, secs_to_com;
+ unsigned int secs_to_flush;
+ unsigned long pos;
+ int err;
+
+ /* If there are no sectors in the cache, flushes (bios without data)
+ * will be cleared on the cache threads
+ */
+ secs_avail = pblk_rb_read_count(&pblk->rwb);
+ if (!secs_avail)
+ return 1;
+
+ /* Without a pending sync point, wait until a minimum write fits */
+ secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
+ if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
+ return 1;
+
+ rqd = pblk_alloc_rqd(pblk, WRITE);
+ if (IS_ERR(rqd)) {
+ pr_err("pblk: cannot allocate write req.\n");
+ return 1;
+ }
+ c_ctx = nvm_rq_to_pdu(rqd);
+
+ bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs);
+ if (!bio) {
+ pr_err("pblk: cannot allocate write bio\n");
+ goto fail_free_rqd;
+ }
+ bio->bi_iter.bi_sector = 0; /* internal bio */
+ bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ rqd->bio = bio;
+
+ /* secs_to_sync is unsigned: a negative result from the helper shows up
+ * as a huge value and is rejected by the range check below.
+ */
+ secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush);
+ if (secs_to_sync > pblk->max_write_pgs) {
+ pr_err("pblk: bad buffer sync calculation\n");
+ goto fail_put_bio;
+ }
+
+ /* Never commit more entries than the buffer actually holds */
+ secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync;
+ pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
+
+ /* NOTE(review): every failure path from here on runs after
+ * pblk_rb_read_commit(); nothing in this function undoes it - confirm
+ * the committed entries are not lost on error.
+ */
+ pgs_read = pblk_rb_read_to_bio(&pblk->rwb, bio, c_ctx, pos,
+ secs_to_sync, secs_avail);
+ if (!pgs_read) {
+ pr_err("pblk: corrupted write bio\n");
+ goto fail_put_bio;
+ }
+
+ if (c_ctx->nr_padded)
+ if (pblk_bio_add_pages(pblk, bio, GFP_KERNEL, c_ctx->nr_padded))
+ goto fail_put_bio;
+
+ /* Assign lbas to ppas and populate request structure */
+ err = pblk_setup_w_rq(pblk, rqd, c_ctx);
+ if (err) {
+ pr_err("pblk: could not setup write request\n");
+ goto fail_free_bio;
+ }
+
+ /* NOTE(review): if submission fails, rqd->meta_list and
+ * c_ctx->lun_bitmap set up by pblk_setup_w_rq() are not released by the
+ * labels below - check whether pblk_free_rqd() takes care of them.
+ */
+ err = pblk_submit_io(pblk, rqd);
+ if (err) {
+ pr_err("pblk: I/O submission failed: %d\n", err);
+ goto fail_free_bio;
+ }
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_long_add(secs_to_sync, &pblk->sub_writes);
+#endif
+
+ return 0;
+
+fail_free_bio:
+ /* Padding pages were added by pblk_bio_add_pages() above */
+ if (c_ctx->nr_padded)
+ pblk_bio_free_pages(pblk, bio, secs_to_sync, c_ctx->nr_padded);
+fail_put_bio:
+ bio_put(bio);
+fail_free_rqd:
+ pblk_free_rqd(pblk, rqd, WRITE);
+
+ return 1;
+}
+
+/*
+ * Main loop of the write thread; @data is the struct pblk instance. It keeps
+ * calling pblk_submit_write() back to back while that returns 0. On a
+ * non-zero return the thread marks itself TASK_INTERRUPTIBLE and yields
+ * through io_schedule(). The loop ends once kthread_should_stop() is true.
+ * Always returns 0.
+ */
+int pblk_write_ts(void *data)
+{
+	struct pblk *pblk = data;
+
+	while (!kthread_should_stop()) {
+		if (pblk_submit_write(pblk)) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			io_schedule();
+		}
+	}
+
+	return 0;
+}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
new file mode 100644
index 0000000000000..99f3186b5288b
--- /dev/null
+++ b/drivers/lightnvm/pblk.h
@@ -0,0 +1,1121 @@
+/*
+ * Copyright (C) 2015 IT University of Copenhagen (rrpc.h)
+ * Copyright (C) 2016 CNEX Labs
+ * Initial release: Matias Bjorling <matias@cnexlabs.com>
+ * Write buffering: Javier Gonzalez <javier@cnexlabs.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Implementation of a Physical Block-device target for Open-channel SSDs.
+ *
+ */
+
+#ifndef PBLK_H_
+#define PBLK_H_
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+#include <linux/crc32.h>
+#include <linux/uuid.h>
+
+#include <linux/lightnvm.h>
+
+/* Run only GC if less than 1/X blocks are free */
+#define GC_LIMIT_INVERSE 5
+#define GC_TIME_MSECS 1000
+
+/* NOTE(review): presumably sizes in bytes - confirm against users */
+#define PBLK_SECTOR (512)
+#define PBLK_EXPOSED_PAGE_SIZE (4096)
+#define PBLK_MAX_REQ_ADDRS (64)
+/* 64 == 1 << 6; presumably PBLK_MAX_REQ_ADDRS expressed as a power of two */
+#define PBLK_MAX_REQ_ADDRS_PW (6)
+
+#define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16)
+
+#define PBLK_COMMAND_TIMEOUT_MS 30000
+
+/* Max 512 LUNs per device */
+#define PBLK_MAX_LUNS_BITMAP (4)
+
+/* 4096 / 512 = 8 units of PBLK_SECTOR per exposed page */
+#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
+
+/* Iterate @rlun over (pblk)->luns[0 .. nr_luns - 1], with @i as the index.
+ * NOTE(review): struct pblk as defined in this header has no nr_luns member,
+ * so this macro would not compile if expanded - confirm it is unused.
+ */
+#define pblk_for_each_lun(pblk, rlun, i) \
+ for ((i) = 0, rlun = &(pblk)->luns[0]; \
+ (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
+
+#define ERASE 2 /* READ = 0, WRITE = 1 */
+
+/* Bit flags: two I/O type bits followed by four write buffer entry bits. All
+ * six values are distinct bits, so they can be combined in one word
+ * (presumably pblk_w_ctx.flags - TODO confirm).
+ */
+enum {
+ /* IO Types */
+ PBLK_IOTYPE_USER = 1 << 0,
+ PBLK_IOTYPE_GC = 1 << 1,
+
+ /* Write buffer flags */
+ PBLK_FLUSH_ENTRY = 1 << 2,
+ PBLK_WRITTEN_DATA = 1 << 3,
+ PBLK_SUBMITTED_ENTRY = 1 << 4,
+ PBLK_WRITABLE_ENTRY = 1 << 5,
+};
+
+/* Block state values (open / closed), going by the PBLK_BLK_ST_ prefix; users
+ * are outside this header - TODO confirm where they are stored.
+ */
+enum {
+ PBLK_BLK_ST_OPEN = 0x1,
+ PBLK_BLK_ST_CLOSED = 0x2,
+};
+
+/* The number of GC lists and the rate-limiter states go together. This way the
+ * rate-limiter can dictate how much GC is needed based on resource utilization.
+ */
+#define PBLK_NR_GC_LISTS 3
+#define PBLK_MAX_GC_JOBS 32
+
+/* Rate-limiter states; three values, matching PBLK_NR_GC_LISTS. Presumably the
+ * values held in pblk_rl.rb_state - TODO confirm.
+ */
+enum {
+ PBLK_RL_HIGH = 1,
+ PBLK_RL_MID = 2,
+ PBLK_RL_LOW = 3,
+};
+
+/* Per-sector metadata entry: a reserved 64-bit word followed by the
+ * little-endian lba.
+ */
+struct pblk_sec_meta {
+ u64 reserved;
+ __le64 lba;
+};
+
+/* Bytes taken by PBLK_MAX_REQ_ADDRS metadata entries. pblk_alloc_w_rq() places
+ * the ppa list this many bytes after the start of the DMA metadata buffer.
+ */
+#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS)
+
+/* Write completion context. It is the per-request data of a write nvm_rq
+ * (see pblk_w_rq_size) and is retrieved with nvm_rq_to_pdu().
+ */
+struct pblk_c_ctx {
+ struct list_head list; /* Head for out-of-order completion */
+
+ unsigned long *lun_bitmap; /* Luns used on current request */
+ unsigned int sentry; /* Write buffer position of the first entry */
+ unsigned int nr_valid; /* Buffer entries carried by the request */
+ unsigned int nr_padded; /* Padding sectors added to the request */
+};
+
+/* Read context; sized into read requests through pblk_r_rq_size */
+struct pblk_r_ctx {
+ struct bio *orig_bio;
+};
+
+/* Recovery context, allocated from pblk->rec_pool when a write fails */
+struct pblk_rec_ctx {
+ struct pblk *pblk;
+ struct nvm_rq *rqd;
+ struct list_head failed; /* Failed pblk_rb_entry items, linked by index */
+ struct work_struct ws_rec; /* Runs pblk_submit_rec() on pblk->kw_wq */
+};
+
+/* Write context of one write buffer entry */
+struct pblk_w_ctx {
+ struct bio_list bios; /* Original bios - used for completion
+ * in REQ_FUA, REQ_FLUSH case
+ */
+ u64 lba; /* Logical address associated with entry */
+ struct ppa_addr ppa; /* Physical address associated with entry */
+ int flags; /* Write context flags */
+};
+
+/* One entry of the write ring buffer (see pblk_rb.entries) */
+struct pblk_rb_entry {
+ struct ppa_addr cacheline; /* Cacheline for this entry */
+ void *data; /* Pointer to data on this entry */
+ struct pblk_w_ctx w_ctx; /* Context for this entry */
+ struct list_head index; /* List head to enable indexes */
+};
+
+/* All-ones unsigned value; presumably marks "no entry" - TODO confirm users */
+#define EMPTY_ENTRY (~0U)
+
+/* A page allocation together with its order and a list link. Presumably one
+ * node of pblk_rb.pages ("List of data pages") - TODO confirm.
+ */
+struct pblk_rb_pages {
+ struct page *pages;
+ int order;
+ struct list_head list;
+};
+
+/* Write ring buffer. Positions (mem, subm, sync, sync_point, l2p_update) are
+ * unsigned offsets into entries[].
+ */
+struct pblk_rb {
+ struct pblk_rb_entry *entries; /* Ring buffer entries */
+ unsigned int mem; /* Write offset - points to next
+ * writable entry in memory
+ */
+ unsigned int subm; /* Read offset - points to last entry
+ * that has been submitted to the media
+ * to be persisted
+ */
+ unsigned int sync; /* Synced - backpointer that signals
+ * the last submitted entry that has
+ * been successfully persisted to media
+ */
+ unsigned int sync_point; /* Sync point - last entry that must be
+ * flushed to the media. Used with
+ * REQ_FLUSH and REQ_FUA
+ */
+ unsigned int l2p_update; /* l2p update point - next entry for
+ * which l2p mapping will be updated to
+ * contain a device ppa address (instead
+ * of a cacheline)
+ */
+ unsigned int nr_entries; /* Number of entries in write buffer -
+ * must be a power of two
+ */
+ unsigned int seg_size; /* Size of the data segments being
+ * stored on each entry. Typically this
+ * will be 4KB
+ */
+
+ struct list_head pages; /* List of data pages */
+
+ spinlock_t w_lock; /* Write lock */
+ spinlock_t s_lock; /* Sync lock */
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */
+#endif
+};
+
+#define PBLK_RECOVERY_SECTORS 16
+
+/* Per-LUN state; pblk->luns points to an array of these */
+struct pblk_lun {
+ struct ppa_addr bppa;
+
+ u8 *bb_list; /* Bad block list for LUN. Only used on
+ * bring up. Bad blocks are managed
+ * within lines on run-time.
+ */
+
+ struct semaphore wr_sem;
+};
+
+/* One GC request: a line, a data buffer, an lba list, two sector counts and a
+ * list link. Presumably queued on pblk_gc.w_list - TODO confirm.
+ */
+struct pblk_gc_rq {
+ struct pblk_line *line;
+ void *data;
+ u64 *lba_list;
+ int nr_secs;
+ int secs_to_gc;
+ struct list_head list;
+};
+
+/* Garbage collection state, embedded in struct pblk as pblk->gc: status
+ * flags and counters, two kernel threads, a workqueue, a timer and a list
+ * with its entry count (w_list / w_entries).
+ * NOTE(review): which of the two spinlocks protects which member is not
+ * visible from this header.
+ */
+struct pblk_gc {
+ int gc_active;
+ int gc_enabled;
+ int gc_forced;
+ int gc_jobs_active;
+ atomic_t inflight_gc;
+
+ struct task_struct *gc_ts;
+ struct task_struct *gc_writer_ts;
+ struct workqueue_struct *gc_reader_wq;
+ struct timer_list gc_timer;
+
+ int w_entries;
+ struct list_head w_list;
+
+ spinlock_t lock;
+ spinlock_t w_lock;
+};
+
+/* Rate limiter state, embedded in struct pblk as pblk->rl */
+struct pblk_rl {
+ unsigned int high; /* Upper threshold for rate limiter (free run -
+ * user I/O rate limiter)
+ */
+ unsigned int low; /* Lower threshold for rate limiter (user I/O
+ * rate limiter - stall)
+ */
+ unsigned int high_pw; /* High rounded up as a power of 2 */
+
+/* Thresholds expressed as divisors: 1/2 = 50 percent, 1/20 = 5 percent */
+#define PBLK_USER_HIGH_THRS 2 /* Begin write limit at 50 percent
+ * available blks
+ */
+#define PBLK_USER_LOW_THRS 20 /* Aggressive GC at 5% available blocks */
+
+ int rb_windows_pw; /* Number of rate windows in the write buffer
+ * given as a power-of-2. This guarantees that
+ * when user I/O is being rate limited, there
+ * will be reserved enough space for the GC to
+ * place its payload. A window is of
+ * pblk->max_write_pgs size, which in NVMe is
+ * 64, i.e., 256kb.
+ */
+ int rb_budget; /* Total number of entries available for I/O */
+ int rb_user_max; /* Max buffer entries available for user I/O */
+ atomic_t rb_user_cnt; /* User I/O buffer counter */
+ int rb_gc_max; /* Max buffer entries available for GC I/O */
+ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */
+ int rb_state; /* Rate-limiter current state */
+ atomic_t rb_gc_cnt; /* GC I/O buffer counter */
+
+ int rb_user_active;
+ struct timer_list u_timer;
+
+ unsigned long long nr_secs;
+ unsigned long total_blocks;
+ atomic_t free_blocks;
+};
+
+#define PBLK_LINE_NR_LUN_BITMAP 2
+#define PBLK_LINE_NR_SEC_BITMAP 2
+#define PBLK_LINE_EMPTY (~0U)
+
+/* Values for the type, state and gc_group members of struct pblk_line. The
+ * three groups use disjoint numeric ranges (0-2, 10-15, 20-25).
+ */
+enum {
+ /* Line Types */
+ PBLK_LINETYPE_FREE = 0,
+ PBLK_LINETYPE_LOG = 1,
+ PBLK_LINETYPE_DATA = 2,
+
+ /* Line state */
+ PBLK_LINESTATE_FREE = 10,
+ PBLK_LINESTATE_OPEN = 11,
+ PBLK_LINESTATE_CLOSED = 12,
+ PBLK_LINESTATE_GC = 13,
+ PBLK_LINESTATE_BAD = 14,
+ PBLK_LINESTATE_CORRUPT = 15,
+
+ /* GC group */
+ PBLK_LINEGC_NONE = 20,
+ PBLK_LINEGC_EMPTY = 21,
+ PBLK_LINEGC_LOW = 22,
+ PBLK_LINEGC_MID = 23,
+ PBLK_LINEGC_HIGH = 24,
+ PBLK_LINEGC_FULL = 25,
+};
+
+/* The bytes 0x70 0x62 0x6c 0x6b are ASCII "pblk" */
+#define PBLK_MAGIC 0x70626c6b /*pblk*/
+
+/* Header shared by the start (line_smeta) and end (line_emeta) metadata of a
+ * line. Multi-byte fields are little-endian.
+ */
+struct line_header {
+ __le32 crc;
+ __le32 identifier; /* pblk identifier */
+ __u8 uuid[16]; /* instance uuid */
+ __le16 type; /* line type */
+ __le16 version; /* type version */
+ __le32 id; /* line id for current line */
+};
+
+/* Start-of-line metadata (see pblk_line.smeta). Fields are little-endian. */
+struct line_smeta {
+ struct line_header header;
+
+ __le32 crc; /* Full structure including struct crc */
+ /* Previous line metadata */
+ __le32 prev_id; /* Line id for previous line */
+
+ /* Current line metadata */
+ __le64 seq_nr; /* Sequence number for current line */
+
+ /* Active writers */
+ __le32 window_wr_lun; /* Number of parallel LUNs to write */
+
+ __le32 rsvd[2];
+};
+
+/*
+ * End-of-line metadata (see pblk_line.emeta). Fields are little-endian.
+ *
+ * Metadata Layout:
+ * 1. struct line_emeta
+ * 2. nr_lbas u64 forming lba list
+ * 3. nr_lines (all) u32 valid sector count (vsc) (~0U: non-alloc line)
+ * 4. nr_luns bits (u64 format) forming line bad block bitmap
+ *
+ * 3. and 4. will be part of FTL log
+ */
+struct line_emeta {
+ struct line_header header;
+
+ __le32 crc; /* Full structure including struct crc */
+
+ /* Previous line metadata */
+ __le32 prev_id; /* Line id for prev line */
+
+ /* Current line metadata */
+ __le64 seq_nr; /* Sequence number for current line */
+
+ /* Active writers */
+ __le32 window_wr_lun; /* Number of parallel LUNs to write */
+
+ /* Bookkeeping for recovery */
+ __le32 next_id; /* Line id for next line */
+ __le64 nr_lbas; /* Number of lbas mapped in line */
+ __le64 nr_valid_lbas; /* Number of valid lbas mapped in line */
+};
+
+/* In-memory state of one line; pblk->lines is an array of these, indexed by
+ * line number.
+ */
+struct pblk_line {
+ struct pblk *pblk;
+ unsigned int id; /* Line number corresponds to the
+ * block line
+ */
+ unsigned int seq_nr; /* Unique line sequence number */
+
+ int state; /* PBLK_LINESTATE_X */
+ int type; /* PBLK_LINETYPE_X */
+ int gc_group; /* PBLK_LINEGC_X */
+ struct list_head list; /* Free, GC lists */
+
+ unsigned long *lun_bitmap; /* Bitmap for LUNs mapped in line */
+
+ struct line_smeta *smeta; /* Start metadata */
+ struct line_emeta *emeta; /* End metadata */
+ int meta_line; /* Metadata line id */
+ u64 smeta_ssec; /* Sector where smeta starts */
+ u64 emeta_ssec; /* Sector where emeta starts */
+
+ unsigned int sec_in_line; /* Number of usable secs in line */
+
+ atomic_t blk_in_line; /* Number of good blocks in line */
+ unsigned long *blk_bitmap; /* Bitmap for valid/invalid blocks */
+ unsigned long *erase_bitmap; /* Bitmap for erased blocks */
+
+ unsigned long *map_bitmap; /* Bitmap for mapped sectors in line */
+ unsigned long *invalid_bitmap; /* Bitmap for invalid sectors in line */
+
+ atomic_t left_eblks; /* Blocks left for erasing */
+ atomic_t left_seblks; /* Blocks left for sync erasing */
+
+ int left_msecs; /* Sectors left for mapping */
+ int left_ssecs; /* Sectors left to sync; the write path
+ * decrements it per synced sector and
+ * closes the line when it reaches zero
+ */
+ unsigned int cur_sec; /* Sector map pointer */
+ unsigned int vsc; /* Valid sector count in line */
+
+ struct kref ref; /* Write buffer L2P references */
+
+ spinlock_t lock; /* Necessary for invalid_bitmap only */
+};
+
+/* Size of the sline_meta[] / eline_meta[] arrays in struct pblk_line_mgmt */
+#define PBLK_DATA_LINES 4
+
+/* Metadata allocation types, see smeta_alloc_type / emeta_alloc_type in
+ * struct pblk_line_mgmt.
+ */
+enum{
+ PBLK_KMALLOC_META = 1,
+ PBLK_VMALLOC_META = 2,
+};
+
+/* Wrapper around one pre-allocated metadata buffer */
+struct pblk_line_metadata {
+ void *meta;
+};
+
+/* Line management state, embedded in struct pblk as pblk->l_mg: line lists,
+ * the current and next log/data lines, pre-allocated metadata buffers and
+ * sequence counters.
+ */
+struct pblk_line_mgmt {
+ int nr_lines; /* Total number of full lines */
+ int nr_free_lines; /* Number of full lines in free list */
+
+ /* Free lists - use free_lock */
+ struct list_head free_list; /* Full lines ready to use */
+ struct list_head corrupt_list; /* Full lines corrupted */
+ struct list_head bad_list; /* Full lines bad */
+
+ /* GC lists - use gc_lock */
+ struct list_head *gc_lists[PBLK_NR_GC_LISTS];
+ struct list_head gc_high_list; /* Full lines ready to GC, high isc */
+ struct list_head gc_mid_list; /* Full lines ready to GC, mid isc */
+ struct list_head gc_low_list; /* Full lines ready to GC, low isc */
+
+ struct list_head gc_full_list; /* Full lines ready to GC, no valid */
+ struct list_head gc_empty_list; /* Full lines close, all valid */
+
+ struct pblk_line *log_line; /* Current FTL log line */
+ struct pblk_line *data_line; /* Current data line */
+ struct pblk_line *log_next; /* Next FTL log line */
+ struct pblk_line *data_next; /* Next data line */
+
+ /* Metadata allocation type: VMALLOC | KMALLOC */
+ int smeta_alloc_type;
+ int emeta_alloc_type;
+
+ /* Pre-allocated metadata for data lines */
+ struct pblk_line_metadata sline_meta[PBLK_DATA_LINES];
+ struct pblk_line_metadata eline_meta[PBLK_DATA_LINES];
+ unsigned long meta_bitmap;
+
+ /* Helpers for fast bitmap calculations */
+ unsigned long *bb_template;
+ unsigned long *bb_aux;
+
+ unsigned long d_seq_nr; /* Data line unique sequence number */
+ unsigned long l_seq_nr; /* Log line unique sequence number */
+
+ spinlock_t free_lock;
+ spinlock_t gc_lock;
+};
+
+/* Line geometry and metadata sizes, embedded in struct pblk as pblk->lm. The
+ * write path sizes its per-request LUN bitmap from lun_bitmap_len.
+ */
+struct pblk_line_meta {
+ unsigned int smeta_len; /* Total length for smeta */
+ unsigned int smeta_sec; /* Sectors needed for smeta*/
+ unsigned int emeta_len; /* Total length for emeta */
+ unsigned int emeta_sec; /* Sectors needed for emeta*/
+ unsigned int emeta_bb; /* Boundary for bb that affects emeta */
+ unsigned int sec_bitmap_len; /* Length for sector bitmap in line */
+ unsigned int blk_bitmap_len; /* Length for block bitmap in line */
+ unsigned int lun_bitmap_len; /* Length for lun bitmap in line */
+
+ unsigned int blk_per_line; /* Number of blocks in a full line */
+ unsigned int sec_per_line; /* Number of sectors in a line */
+ unsigned int min_blk_line; /* Min. number of good blocks in line */
+
+ unsigned int mid_thrs; /* Threshold for GC mid list */
+ unsigned int high_thrs; /* Threshold for GC high list */
+};
+
+/* Physical address format: one 64-bit mask and one 8-bit offset for each of
+ * channel, LUN, plane, block, page and sector. Presumably a field is extracted
+ * as (addr & mask) >> offset - TODO confirm against the users.
+ */
+struct pblk_addr_format {
+ u64 ch_mask;
+ u64 lun_mask;
+ u64 pln_mask;
+ u64 blk_mask;
+ u64 pg_mask;
+ u64 sec_mask;
+ u8 ch_offset;
+ u8 lun_offset;
+ u8 pln_offset;
+ u8 blk_offset;
+ u8 pg_offset;
+ u8 sec_offset;
+};
+
+/* State of one pblk instance. pblk_sysfs_init() obtains it from the gendisk's
+ * private_data, and the sysfs callbacks recover it from the embedded kobject
+ * with container_of().
+ */
+struct pblk {
+ struct nvm_tgt_dev *dev;
+ struct gendisk *disk;
+
+ struct kobject kobj;
+
+ struct pblk_lun *luns;
+
+ struct pblk_line *lines; /* Line array */
+ struct pblk_line_mgmt l_mg; /* Line management */
+ struct pblk_line_meta lm; /* Line metadata */
+
+ int ppaf_bitsize;
+ struct pblk_addr_format ppaf;
+
+ struct pblk_rb rwb;
+
+ int min_write_pgs; /* Minimum amount of pages required by controller */
+ int max_write_pgs; /* Maximum amount of pages supported by controller */
+ int pgs_in_buffer; /* Number of pages that need to be held in buffer to
+ * guarantee successful reads.
+ */
+
+ sector_t capacity; /* Device capacity when bad blocks are subtracted */
+ int over_pct; /* Percentage of device used for over-provisioning */
+
+ /* pblk provisioning values. Used by rate limiter */
+ struct pblk_rl rl;
+
+ struct semaphore erase_sem;
+
+ unsigned char instance_uuid[16];
+#ifdef CONFIG_NVM_DEBUG
+ /* All debug counters apply to 4kb sector I/Os */
+ atomic_long_t inflight_writes; /* Inflight writes (user and gc) */
+ atomic_long_t padded_writes; /* Sectors padded due to flush/fua */
+ atomic_long_t padded_wb; /* Sectors padded in write buffer */
+ atomic_long_t nr_flush; /* Number of flush/fua I/O */
+ atomic_long_t req_writes; /* Sectors stored on write buffer */
+ atomic_long_t sub_writes; /* Sectors submitted from buffer */
+ atomic_long_t sync_writes; /* Sectors synced to media */
+ atomic_long_t compl_writes; /* Sectors completed in write bio */
+ atomic_long_t inflight_reads; /* Inflight sector read requests */
+ atomic_long_t sync_reads; /* Completed sector read requests */
+ atomic_long_t recov_writes; /* Sectors submitted from recovery */
+ atomic_long_t recov_gc_writes; /* Sectors submitted from write GC */
+ atomic_long_t recov_gc_reads; /* Sectors submitted from read GC */
+#endif
+
+ spinlock_t lock;
+
+ atomic_long_t read_failed;
+ atomic_long_t read_empty;
+ atomic_long_t read_high_ecc;
+ atomic_long_t read_failed_gc;
+ atomic_long_t write_failed;
+ atomic_long_t erase_failed;
+
+ struct task_struct *writer_ts;
+
+ /* Simple translation map of logical addresses to physical addresses.
+ * The logical addresses are known by the host system, while the
+ * physical addresses are used when writing to the disk block device.
+ */
+ unsigned char *trans_map;
+ spinlock_t trans_lock;
+
+ /* Write requests that completed out of order, linked through
+ * pblk_c_ctx.list until the sync position reaches them
+ */
+ struct list_head compl_list;
+
+ mempool_t *page_pool;
+ mempool_t *line_ws_pool;
+ mempool_t *rec_pool;
+ mempool_t *r_rq_pool;
+ mempool_t *w_rq_pool;
+ mempool_t *line_meta_pool;
+
+ struct workqueue_struct *kw_wq;
+ struct timer_list wtimer;
+
+ struct pblk_gc gc;
+};
+
+/* Work item tied to a line: the owning instance, the line, an opaque private
+ * pointer and the work_struct. The four fields mirror the arguments of
+ * pblk_line_run_ws(); presumably that function fills one in - TODO confirm.
+ */
+struct pblk_line_ws {
+ struct pblk *pblk;
+ struct pblk_line *line;
+ void *priv;
+ struct work_struct ws;
+};
+
+/* Size of an nvm_rq plus its trailing read / write completion context */
+#define pblk_r_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_r_ctx))
+#define pblk_w_rq_size (sizeof(struct nvm_rq) + sizeof(struct pblk_c_ctx))
+
+/*
+ * pblk ring buffer operations
+ */
+int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
+ unsigned int power_size, unsigned int power_seg_sz);
+unsigned int pblk_rb_calculate_size(unsigned int nr_entries);
+void *pblk_rb_entries_ref(struct pblk_rb *rb);
+int pblk_rb_may_write_user(struct pblk_rb *rb, struct bio *bio,
+ unsigned int nr_entries, unsigned int *pos);
+int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries,
+ unsigned int *pos);
+void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, unsigned int pos);
+void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
+ struct pblk_w_ctx w_ctx, struct pblk_line *gc_line,
+ unsigned int pos);
+struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos);
+
+void pblk_rb_sync_l2p(struct pblk_rb *rb);
+unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct bio *bio,
+ struct pblk_c_ctx *c_ctx,
+ unsigned int pos,
+ unsigned int nr_entries,
+ unsigned int count);
+unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
+ struct list_head *list,
+ unsigned int max);
+int pblk_rb_copy_to_bio(struct pblk_rb *rb, struct bio *bio, sector_t lba,
+ u64 pos, int bio_iter);
+unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int entries);
+
+unsigned int pblk_rb_sync_init(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
+struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
+ struct ppa_addr *ppa);
+void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
+unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb);
+
+unsigned int pblk_rb_read_count(struct pblk_rb *rb);
+unsigned int pblk_rb_wrap_pos(struct pblk_rb *rb, unsigned int pos);
+
+int pblk_rb_tear_down_check(struct pblk_rb *rb);
+int pblk_rb_pos_oob(struct pblk_rb *rb, u64 pos);
+void pblk_rb_data_free(struct pblk_rb *rb);
+ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf);
+
+/*
+ * pblk core
+ */
+struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw);
+int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ struct pblk_c_ctx *c_ctx);
+void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw);
+void pblk_flush_writer(struct pblk *pblk);
+struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba);
+void pblk_discard(struct pblk *pblk, struct bio *bio);
+void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd);
+void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd);
+int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd);
+struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
+ unsigned int nr_secs, unsigned int len,
+ gfp_t gfp_mask);
+struct pblk_line *pblk_line_get(struct pblk *pblk);
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
+int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
+struct pblk_line *pblk_line_get_data(struct pblk *pblk);
+struct pblk_line *pblk_line_get_data_next(struct pblk *pblk);
+int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_is_full(struct pblk_line *line);
+void pblk_line_free(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_close_ws(struct work_struct *work);
+void pblk_line_close(struct pblk *pblk, struct pblk_line *line);
+void pblk_line_mark_bb(struct work_struct *work);
+void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv,
+ void (*work)(struct work_struct *));
+u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line);
+int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa);
+void pblk_line_put(struct kref *ref);
+struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line);
+u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
+int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
+ unsigned long secs_to_flush);
+void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap);
+void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
+ unsigned long *lun_bitmap);
+void pblk_end_bio_sync(struct bio *bio);
+void pblk_end_io_sync(struct nvm_rq *rqd);
+int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags,
+ int nr_pages);
+void pblk_map_pad_invalidate(struct pblk *pblk, struct pblk_line *line,
+ u64 paddr);
+void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off,
+ int nr_pages);
+void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa);
+void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa);
+void pblk_update_map_cache(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa);
+void pblk_update_map_dev(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa, struct ppa_addr entry_line);
+int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa,
+ struct pblk_line *gc_line);
+void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
+ u64 *lba_list, int nr_secs);
+void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
+ sector_t blba, int nr_secs);
+
+/*
+ * pblk user I/O write path
+ */
+int pblk_write_to_cache(struct pblk *pblk, struct bio *bio,
+ unsigned long flags);
+int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list,
+ unsigned int nr_entries, unsigned int nr_rec_entries,
+ struct pblk_line *gc_line, unsigned long flags);
+
+/*
+ * pblk map
+ */
+void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int sentry, unsigned long *lun_bitmap,
+ unsigned int valid_secs, struct ppa_addr *erase_ppa);
+void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+ unsigned long *lun_bitmap, unsigned int valid_secs,
+ unsigned int off);
+
+/*
+ * pblk write thread
+ */
+int pblk_write_ts(void *data);
+void pblk_write_timer_fn(unsigned long data);
+void pblk_write_should_kick(struct pblk *pblk);
+
+/*
+ * pblk read path
+ */
+int pblk_submit_read(struct pblk *pblk, struct bio *bio);
+int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data,
+ unsigned int nr_secs, unsigned int *secs_to_gc,
+ struct pblk_line *line);
+/*
+ * pblk recovery
+ */
+void pblk_submit_rec(struct work_struct *work);
+struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
+void pblk_recov_pad(struct pblk *pblk);
+__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta);
+int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
+ struct pblk_rec_ctx *recovery, u64 *comp_bits,
+ unsigned int comp);
+
+/*
+ * pblk gc
+ */
+#define PBLK_GC_TRIES 3
+
+int pblk_gc_init(struct pblk *pblk);
+void pblk_gc_exit(struct pblk *pblk);
+void pblk_gc_should_start(struct pblk *pblk);
+void pblk_gc_should_stop(struct pblk *pblk);
+int pblk_gc_status(struct pblk *pblk);
+void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled,
+ int *gc_active);
+void pblk_gc_sysfs_force(struct pblk *pblk, int force);
+
+/*
+ * pblk rate limiter
+ */
+void pblk_rl_init(struct pblk_rl *rl, int budget);
+void pblk_rl_free(struct pblk_rl *rl);
+int pblk_rl_gc_thrs(struct pblk_rl *rl);
+unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
+int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
+int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
+void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
+void pblk_rl_set_gc_rsc(struct pblk_rl *rl, int rsv);
+int pblk_rl_sysfs_rate_show(struct pblk_rl *rl);
+void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
+
+/*
+ * pblk sysfs
+ */
+int pblk_sysfs_init(struct gendisk *tdisk);
+void pblk_sysfs_exit(struct gendisk *tdisk);
+
+/*
+ * Allocate @size bytes using the allocator selected by @type.
+ *
+ * PBLK_KMALLOC_META requests go to kmalloc() with @flags. Every other @type
+ * is served by vmalloc(), which takes no flags here, so @flags is ignored on
+ * that path. Returns whatever the chosen allocator returns.
+ */
+static inline void *pblk_malloc(size_t size, int type, gfp_t flags)
+{
+	return (type == PBLK_KMALLOC_META) ? kmalloc(size, flags) :
+					     vmalloc(size);
+}
+
+/*
+ * Release @ptr with the deallocator that matches @type: kfree() for
+ * PBLK_KMALLOC_META, vfree() for every other type.
+ */
+static inline void pblk_mfree(void *ptr, int type)
+{
+	if (type != PBLK_KMALLOC_META) {
+		vfree(ptr);
+		return;
+	}
+
+	kfree(ptr);
+}
+
+/*
+ * Step back sizeof(struct nvm_rq) bytes from @c_ctx and return that address
+ * as a request pointer.
+ *
+ * NOTE(review): presumably the context is laid out immediately behind its
+ * struct nvm_rq in the same allocation - confirm against the allocation site.
+ */
+static inline struct nvm_rq *nvm_rq_from_c_ctx(void *c_ctx)
+{
+	char *ctx_addr = c_ctx;
+
+	return (void *)(ctx_addr - sizeof(struct nvm_rq));
+}
+
+/*
+ * Return the address immediately following the struct line_emeta at @emeta.
+ *
+ * NOTE(review): going by the name, the lba list is stored right behind the
+ * emeta header - confirm against the metadata layout.
+ */
+static inline void *pblk_line_emeta_to_lbas(struct line_emeta *emeta)
+{
+	return &emeta[1];
+}
+
+#define NVM_MEM_PAGE_WRITE (8)
+
+/*
+ * Return NVM_MEM_PAGE_WRITE * nr_luns * sec_per_pl, taken from the geometry
+ * of the device backing @pblk.
+ */
+static inline int pblk_pad_distance(struct pblk *pblk)
+{
+	struct nvm_geo *geo = &pblk->dev->geo;
+
+	return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
+}
+
+/* Return the line id of @p, which is its generic-format blk field. */
+static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
+{
+	int line_id = p.g.blk;
+
+	return line_id;
+}
+
+/*
+ * Return the line id of @p, which is its generic-format blk field. The body
+ * performs the same computation as pblk_dev_ppa_to_line().
+ */
+static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
+{
+	int line_id = p.g.blk;
+
+	return line_id;
+}
+
+/*
+ * Flatten the (lun, ch) pair of @p into a single index:
+ * lun * geo->nr_chnls + ch.
+ */
+static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+	return p.g.ch + p.g.lun * geo->nr_chnls;
+}
+
+/*
+ * A block within a line corresponds to the lun: return the flat index
+ * lun * geo->nr_chnls + ch for @p.
+ */
+static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+{
+	return p.g.ch + p.g.lun * geo->nr_chnls;
+}
+
+/*
+ * Expand a 32-bit translation map entry into a struct ppa_addr.
+ *
+ * Three encodings are recognised, checked in this order:
+ *  - all bits set: the empty address, returned as ADDR_EMPTY;
+ *  - bit 31 set: a cache address whose line is the low 31 bits;
+ *  - anything else: a device address whose blk/pg/lun/ch/pl/sec fields are
+ *    extracted with the masks and offsets in pblk->ppaf.
+ */
+static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
+{
+	struct ppa_addr ppa64;
+
+	/* Start from a fully zeroed address so unset fields read as 0 */
+	ppa64.ppa = 0;
+
+	if (ppa32 == ~0U) {
+		ppa64.ppa = ADDR_EMPTY;
+		return ppa64;
+	}
+
+	if (ppa32 & (1U << 31)) {
+		ppa64.c.line = ppa32 & ((~0U) >> 1);
+		ppa64.c.is_cached = 1;
+		return ppa64;
+	}
+
+	ppa64.g.blk = (ppa32 & pblk->ppaf.blk_mask) >> pblk->ppaf.blk_offset;
+	ppa64.g.pg = (ppa32 & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+	ppa64.g.lun = (ppa32 & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+	ppa64.g.ch = (ppa32 & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+	ppa64.g.pl = (ppa32 & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+	ppa64.g.sec = (ppa32 & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+
+	return ppa64;
+}
+
+/*
+ * Read the translation map entry for @lba.
+ *
+ * When pblk->ppaf_bitsize is below 32 the map is an array of u32 entries that
+ * are expanded with pblk_ppa32_to_ppa64(); otherwise it is an array of
+ * struct ppa_addr read directly. @lba is used as the index without a range
+ * check.
+ */
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+						 sector_t lba)
+{
+	if (pblk->ppaf_bitsize >= 32)
+		return ((struct ppa_addr *)pblk->trans_map)[lba];
+
+	return pblk_ppa32_to_ppa64(pblk, ((u32 *)pblk->trans_map)[lba]);
+}
+
+/*
+ * Pack a struct ppa_addr into a 32-bit translation map entry.
+ *
+ *  - ADDR_EMPTY is encoded as all bits set;
+ *  - a cached address is encoded as its line with bit 31 set;
+ *  - any other address has its blk/pg/lun/ch/pl/sec fields shifted to the
+ *    offsets in pblk->ppaf and ORed together. No masks are applied while
+ *    packing.
+ */
+static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
+{
+	u32 ppa32 = 0;
+
+	if (ppa64.ppa == ADDR_EMPTY)
+		return ~0U;
+
+	if (ppa64.c.is_cached) {
+		ppa32 |= ppa64.c.line;
+		ppa32 |= 1U << 31;
+		return ppa32;
+	}
+
+	ppa32 |= ppa64.g.blk << pblk->ppaf.blk_offset;
+	ppa32 |= ppa64.g.pg << pblk->ppaf.pg_offset;
+	ppa32 |= ppa64.g.lun << pblk->ppaf.lun_offset;
+	ppa32 |= ppa64.g.ch << pblk->ppaf.ch_offset;
+	ppa32 |= ppa64.g.pl << pblk->ppaf.pln_offset;
+	ppa32 |= ppa64.g.sec << pblk->ppaf.sec_offset;
+
+	return ppa32;
+}
+
+/*
+ * Store @ppa as the translation map entry for @lba.
+ *
+ * When pblk->ppaf_bitsize is below 32 the map holds u32 entries and @ppa is
+ * packed with pblk_ppa64_to_ppa32(); otherwise the map holds 64-bit entries
+ * and the raw ppa value is written. @lba is used as the index without a range
+ * check.
+ */
+static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
+				      struct ppa_addr ppa)
+{
+	if (pblk->ppaf_bitsize >= 32) {
+		((u64 *)pblk->trans_map)[lba] = ppa.ppa;
+		return;
+	}
+
+	((u32 *)pblk->trans_map)[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+}
+
+/*
+ * Build the in-line address of @p: its pg, lun, ch, pl and sec fields are
+ * each widened to u64, shifted to the matching pblk->ppaf offset and ORed
+ * together. The blk field does not take part in the result.
+ */
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+					    struct ppa_addr p)
+{
+	return ((u64)p.g.pg << pblk->ppaf.pg_offset) |
+	       ((u64)p.g.lun << pblk->ppaf.lun_offset) |
+	       ((u64)p.g.ch << pblk->ppaf.ch_offset) |
+	       ((u64)p.g.pl << pblk->ppaf.pln_offset) |
+	       ((u64)p.g.sec << pblk->ppaf.sec_offset);
+}
+
+/* Return 1 when @ppa_addr holds ADDR_EMPTY, 0 otherwise. */
+static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
+{
+	if (ppa_addr.ppa != ADDR_EMPTY)
+		return 0;
+
+	return 1;
+}
+
+/* Mark @ppa_addr as empty by storing ADDR_EMPTY in its ppa field. */
+static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
+{
+ ppa_addr->ppa = ADDR_EMPTY;
+}
+
+/*
+ * Return 1 when @ppa is a cache address, 0 otherwise. The empty address is
+ * tested first and is never reported as cached, whatever its is_cached bit
+ * reads.
+ */
+static inline int pblk_addr_in_cache(struct ppa_addr ppa)
+{
+	if (ppa.ppa == ADDR_EMPTY)
+		return 0;
+
+	return ppa.c.is_cached ? 1 : 0;
+}
+
+/*
+ * Return the c.line field of @ppa, converted to int. The is_cached bit is not
+ * checked here.
+ */
+static inline int pblk_addr_to_cacheline(struct ppa_addr ppa)
+{
+ return ppa.c.line;
+}
+
+/*
+ * Build a cache address that refers to cache line @addr.
+ *
+ * The whole address word is cleared before the cache fields are filled in, as
+ * addr_to_gen_ppa() and pblk_ppa32_to_ppa64() do, so that no bit of the
+ * returned value is left holding stack contents and comparisons on the ppa
+ * word stay deterministic.
+ */
+static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
+{
+	struct ppa_addr p;
+
+	p.ppa = 0;
+	p.c.line = addr;
+	p.c.is_cached = 1;
+
+	return p;
+}
+
+/*
+ * Build a generic-format address from the in-line address @paddr and the line
+ * @line_id.
+ *
+ * The address starts out zeroed. @line_id becomes the blk field, and the
+ * pg/lun/ch/pl/sec fields are extracted from @paddr with the masks and
+ * offsets in pblk->ppaf.
+ */
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+					      u64 line_id)
+{
+	struct ppa_addr ppa;
+
+	ppa.ppa = 0;
+
+	/* Position inside the line, decoded field by field */
+	ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+	ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+	ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+	ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+	ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+
+	/* The line itself is carried in the blk field */
+	ppa.g.blk = line_id;
+
+	return ppa;
+}
+
+/*
+ * Translate (@paddr, @line_id) into a struct ppa_addr. This is a plain
+ * pass-through to addr_to_gen_ppa().
+ */
+static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
+					       u64 line_id)
+{
+	return addr_to_gen_ppa(pblk, paddr, line_id);
+}
+
+/*
+ * Compute crc32_le(), seeded with all ones, over the first
+ * sizeof(struct line_header) bytes of @smeta, skipping the leading
+ * sizeof(u32) bytes. @pblk is not used.
+ *
+ * NOTE(review): presumably the skipped word is where the CRC itself is
+ * stored - confirm against struct line_header.
+ */
+static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
+					    struct line_smeta *smeta)
+{
+	unsigned char *payload = (unsigned char *)smeta + sizeof(u32);
+	size_t payload_len = sizeof(struct line_header) - sizeof(u32);
+
+	return crc32_le(~(u32)0, payload, payload_len);
+}
+
+/*
+ * Compute crc32_le(), seeded with all ones, over the part of @smeta that
+ * follows the struct line_header and one further u32. The covered length is
+ * pblk->lm.smeta_len minus those skipped bytes.
+ */
+static inline u32 pblk_calc_smeta_crc(struct pblk *pblk,
+				      struct line_smeta *smeta)
+{
+	size_t skipped = sizeof(struct line_header) + sizeof(u32);
+	unsigned char *payload = (unsigned char *)smeta + skipped;
+
+	return crc32_le(~(u32)0, payload, pblk->lm.smeta_len - skipped);
+}
+
+/*
+ * Compute crc32_le(), seeded with all ones, over the part of @emeta that
+ * follows the struct line_header and one further u32. The covered length is
+ * pblk->lm.emeta_len minus those skipped bytes.
+ */
+static inline u32 pblk_calc_emeta_crc(struct pblk *pblk,
+				      struct line_emeta *emeta)
+{
+	size_t skipped = sizeof(struct line_header) + sizeof(u32);
+	unsigned char *payload = (unsigned char *)emeta + skipped;
+
+	return crc32_le(~(u32)0, payload, pblk->lm.emeta_len - skipped);
+}
+
+/*
+ * Return the flags for a program-type request: the device geometry's
+ * plane_mode shifted right by one, with NVM_IO_SCRAMBLE_ENABLE added when
+ * @type is WRITE.
+ */
+static inline int pblk_set_progr_mode(struct pblk *pblk, int type)
+{
+	int flags = pblk->dev->geo.plane_mode >> 1;
+
+	if (type != WRITE)
+		return flags;
+
+	return flags | NVM_IO_SCRAMBLE_ENABLE;
+}
+
+/*
+ * Return the read-mode flags: NVM_IO_SNGL_ACCESS, NVM_IO_SUSPEND and
+ * NVM_IO_SCRAMBLE_ENABLE ORed together. The value is constant; @pblk is not
+ * consulted.
+ */
+static inline int pblk_set_read_mode(struct pblk *pblk)
+{
+ return NVM_IO_SNGL_ACCESS | NVM_IO_SUSPEND | NVM_IO_SCRAMBLE_ENABLE;
+}
+
+#ifdef CONFIG_NVM_DEBUG
+/*
+ * Log @p at error level, tagged with @msg and @error (printed in hex).
+ * A cached address is reported by its cache line; any other address is
+ * reported as its ch/lun/blk/pg/pl/sec fields.
+ */
+static inline void print_ppa(struct ppa_addr *p, char *msg, int error)
+{
+	if (!p->c.is_cached) {
+		pr_err("ppa: (%s: %x):ch:%d,lun:%d,blk:%d,pg:%d,pl:%d,sec:%d\n",
+		       msg, error,
+		       p->g.ch, p->g.lun, p->g.blk,
+		       p->g.pg, p->g.pl, p->g.sec);
+		return;
+	}
+
+	pr_err("ppa: (%s: %x) cache line: %llu\n",
+	       msg, error, (u64)p->c.line);
+}
+
+/*
+ * Log the addresses of @rqd that are flagged in its ppa_status bitmap.
+ *
+ * A single-address request prints rqd->ppa_addr and returns. Otherwise every
+ * set bit below rqd->nr_ppas selects an entry of rqd->ppa_list to print, and
+ * a summary line with @error and the raw ppa_status follows. @pblk is not
+ * used.
+ */
+static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd,
+					 int error)
+{
+	void *status = (void *)&rqd->ppa_status;
+	int bit;
+
+	if (rqd->nr_ppas == 1) {
+		print_ppa(&rqd->ppa_addr, "rqd", error);
+		return;
+	}
+
+	for (bit = find_next_bit(status, rqd->nr_ppas, 0);
+	     bit < rqd->nr_ppas;
+	     bit = find_next_bit(status, rqd->nr_ppas, bit + 1))
+		print_ppa(&rqd->ppa_list[bit], "rqd", error);
+
+	pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status);
+}
+#endif
+
+/*
+ * Check that each of the @nr_ppas addresses in @ppas is a device address
+ * inside the geometry of @tgt_dev.
+ *
+ * An address fails when it is flagged as cached or when any of its
+ * ch/lun/pl/blk/pg/sec fields reaches the corresponding geometry limit.
+ * Returns 1 at the first failing address (printing it when CONFIG_NVM_DEBUG
+ * is set, with its index as the error value) and 0 when all addresses pass.
+ */
+static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
+					   struct ppa_addr *ppas, int nr_ppas)
+{
+	struct nvm_geo *geo = &tgt_dev->geo;
+	int i;
+
+	for (i = 0; i < nr_ppas; i++) {
+		struct ppa_addr *ppa = &ppas[i];
+
+		if (ppa->c.is_cached ||
+		    ppa->g.ch >= geo->nr_chnls ||
+		    ppa->g.lun >= geo->luns_per_chnl ||
+		    ppa->g.pl >= geo->nr_planes ||
+		    ppa->g.blk >= geo->blks_per_lun ||
+		    ppa->g.pg >= geo->pgs_per_blk ||
+		    ppa->g.sec >= geo->sec_per_pg) {
+#ifdef CONFIG_NVM_DEBUG
+			print_ppa(ppa, "boundary", i);
+#endif
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Return 1 when @paddr is greater than pblk->lm.sec_per_line, 0 otherwise.
+ *
+ * NOTE(review): @paddr == sec_per_line is accepted. If in-line addresses are
+ * zero-based this lets one address past the end through - confirm against
+ * the callers whether '>=' was intended.
+ */
+static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr)
+{
+	return (paddr > pblk->lm.sec_per_line) ? 1 : 0;
+}
+
+/* Return the bi_idx member of @bio's iterator (bio->bi_iter). */
+static inline unsigned int pblk_get_bi_idx(struct bio *bio)
+{
+ return bio->bi_iter.bi_idx;
+}
+
+/*
+ * Return the lba of @bio: its iterator's bi_sector divided by NR_PHY_IN_LOG.
+ * The integer division discards any remainder.
+ */
+static inline sector_t pblk_get_lba(struct bio *bio)
+{
+	sector_t sector = bio->bi_iter.bi_sector;
+
+	return sector / NR_PHY_IN_LOG;
+}
+
+/*
+ * Return the number of PBLK_EXPOSED_PAGE_SIZE units in @bio's iterator
+ * bi_size. The integer division discards any remainder.
+ */
+static inline unsigned int pblk_get_secs(struct bio *bio)
+{
+	unsigned int nr_bytes = bio->bi_iter.bi_size;
+
+	return nr_bytes / PBLK_EXPOSED_PAGE_SIZE;
+}
+
+/* Convert @lba to a sector number by multiplying it by NR_PHY_IN_LOG. */
+static inline sector_t pblk_get_sector(sector_t lba)
+{
+ return lba * NR_PHY_IN_LOG;
+}
+
+/*
+ * Generate a fresh little-endian UUID with uuid_le_gen() and copy its bytes
+ * into pblk->instance_uuid.
+ */
+static inline void pblk_setup_uuid(struct pblk *pblk)
+{
+	uuid_le new_uuid;
+
+	uuid_le_gen(&new_uuid);
+	memcpy(pblk->instance_uuid, new_uuid.b, sizeof(new_uuid.b));
+}
+#endif /* PBLK_H_ */
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
index e00b1d7b976f0..cf0e28a0ff61d 100644
--- a/drivers/lightnvm/rrpc.c
+++ b/drivers/lightnvm/rrpc.c
@@ -318,10 +318,6 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
}
page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
- if (!page) {
- bio_put(bio);
- return -ENOMEM;
- }
while ((slot = find_first_zero_bit(rblk->invalid_pages,
nr_sec_per_blk)) < nr_sec_per_blk) {
@@ -414,7 +410,6 @@ static void rrpc_block_gc(struct work_struct *work)
struct rrpc *rrpc = gcb->rrpc;
struct rrpc_block *rblk = gcb->rblk;
struct rrpc_lun *rlun = rblk->rlun;
- struct nvm_tgt_dev *dev = rrpc->dev;
struct ppa_addr ppa;
mempool_free(gcb, rrpc->gcb_pool);
@@ -430,7 +425,7 @@ static void rrpc_block_gc(struct work_struct *work)
ppa.g.lun = rlun->bppa.g.lun;
ppa.g.blk = rblk->id;
- if (nvm_erase_blk(dev, &ppa, 0))
+ if (nvm_erase_sync(rrpc->dev, &ppa, 1))
goto put_back;
rrpc_put_blk(rrpc, rblk);
@@ -822,7 +817,7 @@ static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
for (i = 0; i < npages; i++) {
/* We assume that mapping occurs at 4KB granularity */
- BUG_ON(!(laddr + i >= 0 && laddr + i < rrpc->nr_sects));
+ BUG_ON(!(laddr + i < rrpc->nr_sects));
gp = &rrpc->trans_map[laddr + i];
if (gp->rblk) {
@@ -851,7 +846,7 @@ static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
return NVM_IO_REQUEUE;
- BUG_ON(!(laddr >= 0 && laddr < rrpc->nr_sects));
+ BUG_ON(!(laddr < rrpc->nr_sects));
gp = &rrpc->trans_map[laddr];
if (gp->rblk) {
@@ -1007,11 +1002,6 @@ static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
}
rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
- if (!rqd) {
- pr_err_ratelimited("rrpc: not able to queue bio.");
- bio_io_error(bio);
- return BLK_QC_T_NONE;
- }
memset(rqd, 0, sizeof(struct nvm_rq));
err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE);
@@ -1275,8 +1265,10 @@ static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
}
nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
- if (nr_blks < 0)
- return nr_blks;
+ if (nr_blks < 0) {
+ ret = nr_blks;
+ goto out;
+ }
for (i = 0; i < nr_blks; i++) {
if (blks[i] == NVM_BLK_T_FREE)
@@ -1514,7 +1506,8 @@ err:
static struct nvm_tgt_type tt_rrpc;
-static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk)
+static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
+ int flags)
{
struct request_queue *bqueue = dev->q;
struct request_queue *tqueue = tdisk->queue;
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 9c689b34e6e79..975922c8f2314 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2773,7 +2773,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
- ti->discard_zeroes_data_unsupported = true;
ti->split_discard_bios = false;
cache->features = ca->features;
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 136fda3ff9e55..fea5bd52ada8f 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -132,6 +132,7 @@ void dm_init_md_queue(struct mapped_device *md);
void dm_init_normal_md_queue(struct mapped_device *md);
int md_in_flight(struct mapped_device *md);
void disable_write_same(struct mapped_device *md);
+void disable_write_zeroes(struct mapped_device *md);
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 389a3637ffcc6..ef1d836bd81b6 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -2030,7 +2030,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
wake_up_process(cc->write_thread);
ti->num_flush_bios = 1;
- ti->discard_zeroes_data_unsupported = true;
return 0;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 03940bf36f6cc..3702e502466d3 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -312,9 +312,12 @@ static void do_region(int op, int op_flags, unsigned region,
*/
if (op == REQ_OP_DISCARD)
special_cmd_max_sectors = q->limits.max_discard_sectors;
+ else if (op == REQ_OP_WRITE_ZEROES)
+ special_cmd_max_sectors = q->limits.max_write_zeroes_sectors;
else if (op == REQ_OP_WRITE_SAME)
special_cmd_max_sectors = q->limits.max_write_same_sectors;
- if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_SAME) &&
+ if ((op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES ||
+ op == REQ_OP_WRITE_SAME) &&
special_cmd_max_sectors == 0) {
dec_count(io, region, -EOPNOTSUPP);
return;
@@ -328,11 +331,18 @@ static void do_region(int op, int op_flags, unsigned region,
/*
* Allocate a suitably sized-bio.
*/
- if ((op == REQ_OP_DISCARD) || (op == REQ_OP_WRITE_SAME))
+ switch (op) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
+ num_bvecs = 0;
+ break;
+ case REQ_OP_WRITE_SAME:
num_bvecs = 1;
- else
+ break;
+ default:
num_bvecs = min_t(int, BIO_MAX_PAGES,
dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
+ }
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_iter.bi_sector = where->sector + (where->count - remaining);
@@ -341,7 +351,7 @@ static void do_region(int op, int op_flags, unsigned region,
bio_set_op_attrs(bio, op, op_flags);
store_io_and_region_in_bio(bio, io, region);
- if (op == REQ_OP_DISCARD) {
+ if (op == REQ_OP_DISCARD || op == REQ_OP_WRITE_ZEROES) {
num_sectors = min_t(sector_t, special_cmd_max_sectors, remaining);
bio->bi_iter.bi_size = num_sectors << SECTOR_SHIFT;
remaining -= num_sectors;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 9e9d04cb7d51f..f85846741d508 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -733,11 +733,11 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->pages = &zero_page_list;
/*
- * Use WRITE SAME to optimize zeroing if all dests support it.
+ * Use WRITE ZEROES to optimize zeroing if all dests support it.
*/
- job->rw = REQ_OP_WRITE_SAME;
+ job->rw = REQ_OP_WRITE_ZEROES;
for (i = 0; i < job->num_dests; i++)
- if (!bdev_write_same(job->dests[i].bdev)) {
+ if (!bdev_write_zeroes_sectors(job->dests[i].bdev)) {
job->rw = WRITE;
break;
}
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 4788b0b989a9b..e17fd44ceef53 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -59,6 +59,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ ti->num_write_zeroes_bios = 1;
ti->private = lc;
return 0;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 7f223dbed49f6..2950b145443d7 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1103,6 +1103,7 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->num_write_same_bios = 1;
+ ti->num_write_zeroes_bios = 1;
if (m->queue_mode == DM_TYPE_BIO_BASED)
ti->per_io_data_size = multipath_per_bio_data_size();
else
@@ -1491,7 +1492,7 @@ static int do_end_io(struct multipath *m, struct request *clone,
*/
int r = DM_ENDIO_REQUEUE;
- if (!error && !clone->errors)
+ if (!error)
return 0; /* I/O complete */
if (noretry_error(error))
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 1e217ba84d090..2dae3e5b851ce 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -2813,7 +2813,9 @@ static void configure_discard_support(struct raid_set *rs)
/* Assume discards not supported until after checks below. */
ti->discards_supported = false;
- /* RAID level 4,5,6 require discard_zeroes_data for data integrity! */
+ /*
+ * XXX: RAID level 4,5,6 require zeroing for safety.
+ */
raid456 = (rs->md.level == 4 || rs->md.level == 5 || rs->md.level == 6);
for (i = 0; i < rs->raid_disks; i++) {
@@ -2827,8 +2829,6 @@ static void configure_discard_support(struct raid_set *rs)
return;
if (raid456) {
- if (!q->limits.discard_zeroes_data)
- return;
if (!devices_handle_discard_safely) {
DMERR("raid456 discard support disabled due to discard_zeroes_data uncertainty.");
DMERR("Set dm-raid.devices_handle_discard_safely=Y to override.");
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 2ddc2d20e62d1..a95cbb80fb344 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1124,7 +1124,6 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = 1;
ti->num_discard_bios = 1;
ti->per_io_data_size = sizeof(struct dm_raid1_bio_record);
- ti->discard_zeroes_data_unsupported = true;
ms->kmirrord_wq = alloc_workqueue("kmirrord", WQ_MEM_RECLAIM, 0);
if (!ms->kmirrord_wq) {
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 0b081d170087d..bff7e3bdb4ed1 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -298,9 +298,14 @@ static void dm_done(struct request *clone, int error, bool mapped)
r = rq_end_io(tio->ti, clone, error, &tio->info);
}
- if (unlikely(r == -EREMOTEIO && (req_op(clone) == REQ_OP_WRITE_SAME) &&
- !clone->q->limits.max_write_same_sectors))
- disable_write_same(tio->md);
+ if (unlikely(r == -EREMOTEIO)) {
+ if (req_op(clone) == REQ_OP_WRITE_SAME &&
+ !clone->q->limits.max_write_same_sectors)
+ disable_write_same(tio->md);
+ if (req_op(clone) == REQ_OP_WRITE_ZEROES &&
+ !clone->q->limits.max_write_zeroes_sectors)
+ disable_write_zeroes(tio->md);
+ }
if (r <= 0)
/* The target wants to complete the I/O */
@@ -358,7 +363,7 @@ static void dm_complete_request(struct request *rq, int error)
if (!rq->q->mq_ops)
blk_complete_request(rq);
else
- blk_mq_complete_request(rq, error);
+ blk_mq_complete_request(rq);
}
/*
@@ -762,7 +767,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_MQ_RQ_QUEUE_OK;
}
-static struct blk_mq_ops dm_mq_ops = {
+static const struct blk_mq_ops dm_mq_ops = {
.queue_rq = dm_mq_queue_rq,
.complete = dm_softirq_done,
.init_request = dm_mq_init_request,
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 28193a57bf471..5ef49c121d995 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -169,6 +169,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->num_flush_bios = stripes;
ti->num_discard_bios = stripes;
ti->num_write_same_bios = stripes;
+ ti->num_write_zeroes_bios = stripes;
sc->chunk_size = chunk_size;
if (chunk_size & (chunk_size - 1))
@@ -293,6 +294,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
if (unlikely(bio_op(bio) == REQ_OP_DISCARD) ||
+ unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES) ||
unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) {
target_bio_nr = dm_bio_get_target_bio_nr(bio);
BUG_ON(target_bio_nr >= sc->stripes);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ad16d9c9d5aa..958275aca0084 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1449,22 +1449,6 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned long flush)
return false;
}
-static bool dm_table_discard_zeroes_data(struct dm_table *t)
-{
- struct dm_target *ti;
- unsigned i = 0;
-
- /* Ensure that all targets supports discard_zeroes_data. */
- while (i < dm_table_get_num_targets(t)) {
- ti = dm_table_get_target(t, i++);
-
- if (ti->discard_zeroes_data_unsupported)
- return false;
- }
-
- return true;
-}
-
static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1533,6 +1517,34 @@ static bool dm_table_supports_write_same(struct dm_table *t)
return true;
}
+/*
+ * iterate_devices callback: return non-zero when @dev has a request queue
+ * whose max_write_zeroes_sectors limit is 0. A device without a queue yields
+ * 0. @ti, @start, @len and @data are not used.
+ */
+static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev *dev,
+					   sector_t start, sector_t len, void *data)
+{
+	struct request_queue *queue = bdev_get_queue(dev->bdev);
+
+	if (!queue)
+		return 0;
+
+	return !queue->limits.max_write_zeroes_sectors;
+}
+
+/*
+ * Return true only when every target in @t can handle WRITE ZEROES.
+ *
+ * A target disqualifies the table when its num_write_zeroes_bios is 0, when
+ * its type has no iterate_devices method, or when iterate_devices reports a
+ * device flagged by device_not_write_zeroes_capable().
+ */
+static bool dm_table_supports_write_zeroes(struct dm_table *t)
+{
+	unsigned i;
+
+	for (i = 0; i < dm_table_get_num_targets(t); i++) {
+		struct dm_target *ti = dm_table_get_target(t, i);
+
+		if (!ti->num_write_zeroes_bios)
+			return false;
+
+		if (!ti->type->iterate_devices)
+			return false;
+
+		if (ti->type->iterate_devices(ti, device_not_write_zeroes_capable, NULL))
+			return false;
+	}
+
+	return true;
+}
+
+
static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
@@ -1592,9 +1604,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
blk_queue_write_cache(q, wc, fua);
- if (!dm_table_discard_zeroes_data(t))
- q->limits.discard_zeroes_data = 0;
-
/* Ensure that all underlying devices are non-rotational. */
if (dm_table_all_devices_attribute(t, device_is_nonrot))
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
@@ -1603,6 +1612,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
if (!dm_table_supports_write_same(t))
q->limits.max_write_same_sectors = 0;
+ if (!dm_table_supports_write_zeroes(t))
+ q->limits.max_write_zeroes_sectors = 0;
if (dm_table_all_devices_attribute(t, queue_supports_sg_merge))
queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2b266a2b5035b..a5f1916f621a9 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3263,7 +3263,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
* them down to the data device. The thin device's discard
* processing will cause mappings to be removed from the btree.
*/
- ti->discard_zeroes_data_unsupported = true;
if (pf.discard_enabled && pf.discard_passdown) {
ti->num_discard_bios = 1;
@@ -4119,7 +4118,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
/* In case the pool supports discards, pass them on. */
- ti->discard_zeroes_data_unsupported = true;
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index dfb75979e4555..8bf397729bbd2 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -810,7 +810,6 @@ static void dec_pending(struct dm_io *io, int error)
queue_io(md, bio);
} else {
/* done with normal IO or empty flush */
- trace_block_bio_complete(md->queue, bio, io_error);
bio->bi_error = io_error;
bio_endio(bio);
}
@@ -825,6 +824,14 @@ void disable_write_same(struct mapped_device *md)
limits->max_write_same_sectors = 0;
}
+/*
+ * Turn off WRITE ZEROES for @md by clearing max_write_zeroes_sectors in its
+ * queue limits.
+ */
+void disable_write_zeroes(struct mapped_device *md)
+{
+	/* device doesn't really support WRITE ZEROES, disable it */
+	dm_get_queue_limits(md)->max_write_zeroes_sectors = 0;
+}
+
static void clone_endio(struct bio *bio)
{
int error = bio->bi_error;
@@ -851,9 +858,14 @@ static void clone_endio(struct bio *bio)
}
}
- if (unlikely(r == -EREMOTEIO && (bio_op(bio) == REQ_OP_WRITE_SAME) &&
- !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
- disable_write_same(md);
+ if (unlikely(r == -EREMOTEIO)) {
+ if (bio_op(bio) == REQ_OP_WRITE_SAME &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
+ disable_write_same(md);
+ if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+ disable_write_zeroes(md);
+ }
free_tio(tio);
dec_pending(io, error);
@@ -1202,6 +1214,11 @@ static unsigned get_num_write_same_bios(struct dm_target *ti)
return ti->num_write_same_bios;
}
+/*
+ * Return @ti's num_write_zeroes_bios. Passed as the bio-count callback to
+ * __send_changing_extent_only() by __send_write_zeroes().
+ */
+static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
+{
+ return ti->num_write_zeroes_bios;
+}
+
typedef bool (*is_split_required_fn)(struct dm_target *ti);
static bool is_split_required_for_discard(struct dm_target *ti)
@@ -1256,6 +1273,11 @@ static int __send_write_same(struct clone_info *ci)
return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
}
+/*
+ * Process a WRITE ZEROES bio through __send_changing_extent_only(), using
+ * get_num_write_zeroes_bios() as the bio-count callback and NULL as the third
+ * argument. The helper's return value is passed through unchanged.
+ */
+static int __send_write_zeroes(struct clone_info *ci)
+{
+ return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
+}
+
/*
* Select the correct strategy for processing a non-flush bio.
*/
@@ -1270,6 +1292,8 @@ static int __split_and_process_non_flush(struct clone_info *ci)
return __send_discard(ci);
else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
return __send_write_same(ci);
+ else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
+ return __send_write_zeroes(ci);
ti = dm_table_find_target(ci->map, ci->sector);
if (!dm_target_is_valid(ti))
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 3e38e0207a3eb..377a8a3672e3d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -293,6 +293,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
split, disk_devt(mddev->gendisk),
bio_sector);
mddev_check_writesame(mddev, split);
+ mddev_check_write_zeroes(mddev, split);
generic_make_request(split);
}
} while (split != bio);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index dde8ecb760c87..1e76d64ce1803 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -709,4 +709,11 @@ static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio)
!bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
mddev->queue->limits.max_write_same_sectors = 0;
}
+
+/*
+ * If @bio is a WRITE ZEROES request and the queue of the device it targets
+ * has a max_write_zeroes_sectors limit of 0, clear the same limit on the
+ * queue of @mddev. Any other bio leaves @mddev untouched.
+ */
+static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio)
+{
+	if (bio_op(bio) != REQ_OP_WRITE_ZEROES)
+		return;
+
+	if (bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
+		return;
+
+	mddev->queue->limits.max_write_zeroes_sectors = 0;
+}
#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 79a12b59250bb..e95d521d93e9b 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -139,6 +139,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio)
mp_bh->bio.bi_end_io = multipath_end_request;
mp_bh->bio.bi_private = mp_bh;
mddev_check_writesame(mddev, &mp_bh->bio);
+ mddev_check_write_zeroes(mddev, &mp_bh->bio);
generic_make_request(&mp_bh->bio);
return;
}
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 93347ca7c7a61..ce7a6a56cf738 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -383,6 +383,7 @@ static int raid0_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
@@ -504,6 +505,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
split, disk_devt(mddev->gendisk),
bio_sector);
mddev_check_writesame(mddev, split);
+ mddev_check_write_zeroes(mddev, split);
generic_make_request(split);
}
} while (split != bio);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a34f58772022c..b59cc100320af 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3177,8 +3177,10 @@ static int raid1_run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- if (mddev->queue)
+ if (mddev->queue) {
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
+ }
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e89a8d78a9ed5..28ec3a93aceea 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3749,6 +3749,7 @@ static int raid10_run(struct mddev *mddev)
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, chunk_size);
if (conf->geo.raid_disks % conf->geo.near_copies)
blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed5cd705b985f..2efdb0d674607 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5031,8 +5031,6 @@ static void raid5_align_endio(struct bio *bi)
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
- trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev),
- raid_bi, 0);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
@@ -7229,7 +7227,6 @@ static int raid5_run(struct mddev *mddev)
if (mddev->queue) {
int chunk_size;
- bool discard_supported = true;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
@@ -7265,48 +7262,32 @@ static int raid5_run(struct mddev *mddev)
blk_queue_max_discard_sectors(mddev->queue,
0xfffe * STRIPE_SECTORS);
- /*
- * unaligned part of discard request will be ignored, so can't
- * guarantee discard_zeroes_data
- */
- mddev->queue->limits.discard_zeroes_data = 0;
-
blk_queue_max_write_same_sectors(mddev->queue, 0);
+ blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
- /*
- * discard_zeroes_data is required, otherwise data
- * could be lost. Consider a scenario: discard a stripe
- * (the stripe could be inconsistent if
- * discard_zeroes_data is 0); write one disk of the
- * stripe (the stripe could be inconsistent again
- * depending on which disks are used to calculate
- * parity); the disk is broken; The stripe data of this
- * disk is lost.
- */
- if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
- !bdev_get_queue(rdev->bdev)->
- limits.discard_zeroes_data)
- discard_supported = false;
- /* Unfortunately, discard_zeroes_data is not currently
- * a guarantee - just a hint. So we only allow DISCARD
- * if the sysadmin has confirmed that only safe devices
- * are in use by setting a module parameter.
- */
- if (!devices_handle_discard_safely) {
- if (discard_supported) {
- pr_info("md/raid456: discard support disabled due to uncertainty.\n");
- pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
- }
- discard_supported = false;
- }
}
- if (discard_supported &&
+ /*
+ * Discarded blocks must read back as zeroes, otherwise data
+ * could be lost. Consider a scenario: discard a stripe
+ * (the stripe could be inconsistent if the member devices
+ * do not zero discarded blocks); write one disk of the
+ * stripe (the stripe could be inconsistent again
+ * depending on which disks are used to calculate
+ * parity); the disk is broken; the stripe data of this
+ * disk is lost.
+ *
+ * We only allow DISCARD if the sysadmin has confirmed that
+ * only safe devices are in use by setting a module parameter.
+ * A better idea might be to turn DISCARD into WRITE_ZEROES
+ * requests, as that is required to be safe.
+ */
+ if (devices_handle_discard_safely &&
mddev->queue->limits.max_discard_sectors >= (stripe >> 9) &&
mddev->queue->limits.discard_granularity >= stripe)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 493eb10ce5804..4c54ad34e17a1 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -167,8 +167,6 @@ static void mmc_queue_setup_discard(struct request_queue *q,
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
blk_queue_max_discard_sectors(q, max_discard);
- if (card->erased_byte == 0 && !mmc_can_discard(card))
- q->limits.discard_zeroes_data = 1;
q->limits.discard_granularity = card->pref_erase << 9;
/* granularity must not be greater than max. discard */
if (card->pref_erase > max_discard)
diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 66a9dedd10620..1517da3ddd7d0 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -46,7 +46,7 @@
#include "mtdcore.h"
-static struct backing_dev_info *mtd_bdi;
+struct backing_dev_info *mtd_bdi;
#ifdef CONFIG_PM_SLEEP
@@ -496,11 +496,9 @@ int add_mtd_device(struct mtd_info *mtd)
* mtd_device_parse_register() multiple times on the same master MTD,
* especially with CONFIG_MTD_PARTITIONED_MASTER=y.
*/
- if (WARN_ONCE(mtd->backing_dev_info, "MTD already registered\n"))
+ if (WARN_ONCE(mtd->dev.type, "MTD already registered\n"))
return -EEXIST;
- mtd->backing_dev_info = mtd_bdi;
-
BUG_ON(mtd->writesize == 0);
mutex_lock(&mtd_table_mutex);
@@ -1775,13 +1773,18 @@ static struct backing_dev_info * __init mtd_bdi_init(char *name)
struct backing_dev_info *bdi;
int ret;
- bdi = kzalloc(sizeof(*bdi), GFP_KERNEL);
+ bdi = bdi_alloc(GFP_KERNEL);
if (!bdi)
return ERR_PTR(-ENOMEM);
- ret = bdi_setup_and_register(bdi, name);
+ bdi->name = name;
+ /*
+ * We put '-0' suffix to the name to get the same name format as we
+ * used to get. Since this is called only once, we get a unique name.
+ */
+ ret = bdi_register(bdi, "%.28s-0", name);
if (ret)
- kfree(bdi);
+ bdi_put(bdi);
return ret ? ERR_PTR(ret) : bdi;
}
@@ -1813,8 +1816,7 @@ static int __init init_mtd(void)
out_procfs:
if (proc_mtd)
remove_proc_entry("mtd", NULL);
- bdi_destroy(mtd_bdi);
- kfree(mtd_bdi);
+ bdi_put(mtd_bdi);
err_bdi:
class_unregister(&mtd_class);
err_reg:
@@ -1828,8 +1830,7 @@ static void __exit cleanup_mtd(void)
if (proc_mtd)
remove_proc_entry("mtd", NULL);
class_unregister(&mtd_class);
- bdi_destroy(mtd_bdi);
- kfree(mtd_bdi);
+ bdi_put(mtd_bdi);
idr_destroy(&mtd_idr);
}
diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 20c02a3b7417c..e43fea896d1ed 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -18,6 +18,7 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/major.h>
+#include <linux/backing-dev.h>
/*
* compare superblocks to see if they're equivalent
@@ -38,6 +39,8 @@ static int get_sb_mtd_compare(struct super_block *sb, void *_mtd)
return 0;
}
+extern struct backing_dev_info *mtd_bdi;
+
/*
* mark the superblock by the MTD device it is using
* - set the device number to be the correct MTD block device for persistence
@@ -49,7 +52,8 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd)
sb->s_mtd = mtd;
sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
- sb->s_bdi = mtd->backing_dev_info;
+ sb->s_bdi = bdi_get(mtd_bdi);
+
return 0;
}
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index c80869e60909c..51f2be8889b57 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -347,7 +347,7 @@ static int ubiblock_init_request(void *data, struct request *req,
return 0;
}
-static struct blk_mq_ops ubiblock_mq_ops = {
+static const struct blk_mq_ops ubiblock_mq_ops = {
.queue_rq = ubiblock_queue_rq,
.init_request = ubiblock_init_request,
};
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index eeb409c287b8e..bf6729b1d8bf7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -49,10 +49,9 @@ unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
-unsigned int nvme_max_retries = 5;
-module_param_named(max_retries, nvme_max_retries, uint, 0644);
+static u8 nvme_max_retries = 5;
+module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-EXPORT_SYMBOL_GPL(nvme_max_retries);
static int nvme_char_major;
module_param(nvme_char_major, int, 0);
@@ -67,6 +66,57 @@ static DEFINE_SPINLOCK(dev_list_lock);
static struct class *nvme_class;
+/*
+ * Map the NVMe status code recorded in the request (the low 11 bits of
+ * nvme_req(req)->status) to a Linux errno.  Returns 0 for
+ * NVME_SC_SUCCESS and a negative errno otherwise; any status code not
+ * listed explicitly is reported as -EIO.
+ */
+static int nvme_error_status(struct request *req)
+{
+	const unsigned int sc = nvme_req(req)->status & 0x7ff;
+	int err;
+
+	switch (sc) {
+	case NVME_SC_SUCCESS:
+		err = 0;
+		break;
+	case NVME_SC_CAP_EXCEEDED:
+		err = -ENOSPC;
+		break;
+
+	/*
+	 * XXX: these errors are a nasty side-band protocol to
+	 * drivers/md/dm-mpath.c:noretry_error() that aren't documented
+	 * anywhere..
+	 */
+	case NVME_SC_CMD_SEQ_ERROR:
+		err = -EILSEQ;
+		break;
+	case NVME_SC_ONCS_NOT_SUPPORTED:
+		err = -EOPNOTSUPP;
+		break;
+	case NVME_SC_WRITE_FAULT:
+	case NVME_SC_READ_ERROR:
+	case NVME_SC_UNWRITTEN_BLOCK:
+		err = -ENODATA;
+		break;
+
+	default:
+		err = -EIO;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * A failed request is retried only if all of the following hold, checked
+ * in this order: the block layer did not mark it no-retry, the status
+ * does not carry NVME_SC_DNR, fewer than req->timeout jiffies have
+ * elapsed since req->start_time, and it has been retried fewer than
+ * nvme_max_retries times.
+ */
+static inline bool nvme_req_needs_retry(struct request *req)
+{
+	return !blk_noretry_request(req) &&
+	       !(nvme_req(req)->status & NVME_SC_DNR) &&
+	       jiffies - req->start_time < req->timeout &&
+	       nvme_req(req)->retries < nvme_max_retries;
+}
+
+/*
+ * Common completion handler: finish the request with the errno derived
+ * from its NVMe status, unless it failed and is still eligible for a
+ * retry, in which case the retry counter is bumped and the request is
+ * requeued instead.
+ */
+void nvme_complete_rq(struct request *req)
+{
+	/* Success, or a failure that may not be retried: end it now. */
+	if (likely(!nvme_req(req)->status) || !nvme_req_needs_retry(req)) {
+		blk_mq_end_request(req, nvme_error_status(req));
+		return;
+	}
+
+	nvme_req(req)->retries++;
+	/* Kick the requeue list only when the queue is not stopped. */
+	blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
+}
+EXPORT_SYMBOL_GPL(nvme_complete_rq);
+
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
int status;
@@ -80,7 +130,9 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
status = NVME_SC_ABORT_REQ;
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
- blk_mq_complete_request(req, status);
+ nvme_req(req)->status = status;
+ blk_mq_complete_request(req);
+
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -205,12 +257,6 @@ fail:
return NULL;
}
-void nvme_requeue_req(struct request *req)
-{
- blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
-}
-EXPORT_SYMBOL_GPL(nvme_requeue_req);
-
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid)
{
@@ -327,6 +373,12 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
{
int ret = BLK_MQ_RQ_QUEUE_OK;
+ if (!(req->rq_flags & RQF_DONTPREP)) {
+ nvme_req(req)->retries = 0;
+ nvme_req(req)->flags = 0;
+ req->rq_flags |= RQF_DONTPREP;
+ }
+
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
@@ -335,6 +387,8 @@ int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
break;
+ case REQ_OP_WRITE_ZEROES:
+ /* currently only aliased to deallocate for a few ctrls: */
case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd);
break;
@@ -378,7 +432,10 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
blk_execute_rq(req->q, NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
- ret = req->errors;
+ if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+ else
+ ret = nvme_req(req)->status;
out:
blk_mq_free_request(req);
return ret;
@@ -463,7 +520,10 @@ int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
}
submit:
blk_execute_rq(req->q, disk, req, 0);
- ret = req->errors;
+ if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+ else
+ ret = nvme_req(req)->status;
if (result)
*result = le32_to_cpu(nvme_req(req)->result.u32);
if (meta && !ret && !write) {
@@ -900,16 +960,14 @@ static void nvme_config_discard(struct nvme_ns *ns)
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- if (ctrl->quirks & NVME_QUIRK_DISCARD_ZEROES)
- ns->queue->limits.discard_zeroes_data = 1;
- else
- ns->queue->limits.discard_zeroes_data = 0;
-
ns->queue->limits.discard_alignment = logical_block_size;
ns->queue->limits.discard_granularity = logical_block_size;
blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+
+ if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+ blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
}
static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id)
@@ -2393,7 +2451,7 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
- blk_mq_freeze_queue_start(ns->queue);
+ blk_freeze_queue_start(ns->queue);
mutex_unlock(&ctrl->namespaces_mutex);
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 5b7386f69f4de..990e6fb32a636 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -471,6 +471,16 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
}
EXPORT_SYMBOL_GPL(nvmf_connect_io_queue);
+/*
+ * nvmf_should_reconnect() - decide whether another reconnect attempt is
+ * allowed for a controller.
+ * @ctrl: controller whose fabrics options carry the reconnect accounting.
+ *
+ * opts->max_reconnects == -1 means "reconnect forever" (it is set that
+ * way when ctrl_loss_tmo is negative), so that case must always permit
+ * another attempt.  Otherwise a retry is allowed only while
+ * nr_reconnects is still below max_reconnects; a limit of zero therefore
+ * never permits a reconnect.
+ *
+ * Return: true if the transport should schedule another reconnect.
+ */
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
+{
+	const struct nvmf_ctrl_options *opts = ctrl->opts;
+
+	/* Unlimited reconnects requested: never give up. */
+	if (opts->max_reconnects == -1)
+		return true;
+
+	return opts->nr_reconnects < opts->max_reconnects;
+}
+EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
+
/**
* nvmf_register_transport() - NVMe Fabrics Library registration function.
* @ops: Transport ops instance to be registered to the
@@ -533,6 +543,7 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_QUEUE_SIZE, "queue_size=%d" },
{ NVMF_OPT_NR_IO_QUEUES, "nr_io_queues=%d" },
{ NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" },
+ { NVMF_OPT_CTRL_LOSS_TMO, "ctrl_loss_tmo=%d" },
{ NVMF_OPT_KATO, "keep_alive_tmo=%d" },
{ NVMF_OPT_HOSTNQN, "hostnqn=%s" },
{ NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
@@ -546,6 +557,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
char *options, *o, *p;
int token, ret = 0;
size_t nqnlen = 0;
+ int ctrl_loss_tmo = NVMF_DEF_CTRL_LOSS_TMO;
/* Set defaults */
opts->queue_size = NVMF_DEF_QUEUE_SIZE;
@@ -655,6 +667,16 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->kato = token;
break;
+ case NVMF_OPT_CTRL_LOSS_TMO:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (token < 0)
+ pr_warn("ctrl_loss_tmo < 0 will reconnect forever\n");
+ ctrl_loss_tmo = token;
+ break;
case NVMF_OPT_HOSTNQN:
if (opts->host) {
pr_err("hostnqn already user-assigned: %s\n",
@@ -710,6 +732,12 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
}
+ if (ctrl_loss_tmo < 0)
+ opts->max_reconnects = -1;
+ else
+ opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+ opts->reconnect_delay);
+
if (!opts->host) {
kref_get(&nvmf_default_host->ref);
opts->host = nvmf_default_host;
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 156018182ce43..f5a9c1fb186f2 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -21,6 +21,8 @@
#define NVMF_MAX_QUEUE_SIZE 1024
#define NVMF_DEF_QUEUE_SIZE 128
#define NVMF_DEF_RECONNECT_DELAY 10
+/* default to 600 seconds of reconnect attempts before giving up */
+#define NVMF_DEF_CTRL_LOSS_TMO 600
/*
* Define a host as seen by the target. We allocate one at boot, but also
@@ -53,6 +55,7 @@ enum {
NVMF_OPT_HOSTNQN = 1 << 8,
NVMF_OPT_RECONNECT_DELAY = 1 << 9,
NVMF_OPT_HOST_TRADDR = 1 << 10,
+ NVMF_OPT_CTRL_LOSS_TMO = 1 << 11,
};
/**
@@ -77,6 +80,10 @@ enum {
* @discovery_nqn: indicates if the subsysnqn is the well-known discovery NQN.
* @kato: Keep-alive timeout.
* @host: Virtual NVMe host, contains the NQN and Host ID.
+ * @nr_reconnects: number of reconnect attempted since the last ctrl failure
+ * @max_reconnects: maximum number of allowed reconnect attempts before removing
+ * the controller, (-1) means reconnect forever, zero means remove
+ * immediately;
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -91,6 +98,8 @@ struct nvmf_ctrl_options {
bool discovery_nqn;
unsigned int kato;
struct nvmf_host *host;
+ int nr_reconnects;
+ int max_reconnects;
};
/*
@@ -133,5 +142,6 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
void nvmf_free_options(struct nvmf_ctrl_options *opts);
const char *nvmf_get_subsysnqn(struct nvme_ctrl *ctrl);
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
+bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
#endif /* _NVME_FABRICS_H */
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index d996ca73d3be3..ecc1048de8373 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -61,16 +61,23 @@ struct nvme_fc_queue {
unsigned long flags;
} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
+enum nvme_fcop_flags {
+ FCOP_FLAGS_TERMIO = (1 << 0),
+ FCOP_FLAGS_RELEASED = (1 << 1),
+ FCOP_FLAGS_COMPLETE = (1 << 2),
+};
+
struct nvmefc_ls_req_op {
struct nvmefc_ls_req ls_req;
- struct nvme_fc_ctrl *ctrl;
+ struct nvme_fc_rport *rport;
struct nvme_fc_queue *queue;
struct request *rq;
+ u32 flags;
int ls_error;
struct completion ls_done;
- struct list_head lsreq_list; /* ctrl->ls_req_list */
+ struct list_head lsreq_list; /* rport->ls_req_list */
bool req_queued;
};
@@ -120,6 +127,9 @@ struct nvme_fc_rport {
struct list_head endp_list; /* for lport->endp_list */
struct list_head ctrl_list;
+ struct list_head ls_req_list;
+ struct device *dev; /* physical device for dma */
+ struct nvme_fc_lport *lport;
spinlock_t lock;
struct kref ref;
} __aligned(sizeof(u64)); /* alignment for other things alloc'd with */
@@ -144,7 +154,6 @@ struct nvme_fc_ctrl {
u64 cap;
struct list_head ctrl_list; /* rport->ctrl_list */
- struct list_head ls_req_list;
struct blk_mq_tag_set admin_tag_set;
struct blk_mq_tag_set tag_set;
@@ -419,9 +428,12 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport,
INIT_LIST_HEAD(&newrec->endp_list);
INIT_LIST_HEAD(&newrec->ctrl_list);
+ INIT_LIST_HEAD(&newrec->ls_req_list);
kref_init(&newrec->ref);
spin_lock_init(&newrec->lock);
newrec->remoteport.localport = &lport->localport;
+ newrec->dev = lport->dev;
+ newrec->lport = lport;
newrec->remoteport.private = &newrec[1];
newrec->remoteport.port_role = pinfo->port_role;
newrec->remoteport.node_name = pinfo->node_name;
@@ -444,7 +456,6 @@ out_kfree_rport:
out_reghost_failed:
*portptr = NULL;
return ret;
-
}
EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport);
@@ -487,6 +498,30 @@ nvme_fc_rport_get(struct nvme_fc_rport *rport)
return kref_get_unless_zero(&rport->ref);
}
+/*
+ * Ask the LLDD to abort every LS request queued on @rport.
+ *
+ * Each pass takes rport->lock, picks the first entry on ls_req_list not
+ * yet flagged FCOP_FLAGS_TERMIO, flags it, drops the lock and then calls
+ * the lport's ls_abort() for it.  The scan restarts from the list head
+ * after every abort (presumably because the list may change while the
+ * lock is released) and stops once no unflagged entry remains.
+ *
+ * Always returns 0.
+ */
+static int
+nvme_fc_abort_lsops(struct nvme_fc_rport *rport)
+{
+	struct nvmefc_ls_req_op *lsop;
+	struct nvmefc_ls_req_op *pending;
+	unsigned long flags;
+
+	for (;;) {
+		pending = NULL;
+
+		spin_lock_irqsave(&rport->lock, flags);
+		list_for_each_entry(lsop, &rport->ls_req_list, lsreq_list) {
+			if (lsop->flags & FCOP_FLAGS_TERMIO)
+				continue;
+			lsop->flags |= FCOP_FLAGS_TERMIO;
+			pending = lsop;
+			break;
+		}
+		spin_unlock_irqrestore(&rport->lock, flags);
+
+		if (!pending)
+			break;
+
+		/* Called with the lock released. */
+		rport->lport->ops->ls_abort(&rport->lport->localport,
+					    &rport->remoteport,
+					    &pending->ls_req);
+	}
+
+	return 0;
+}
+
/**
* nvme_fc_unregister_remoteport - transport entry point called by an
* LLDD to deregister/remove a previously
@@ -522,6 +557,8 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
spin_unlock_irqrestore(&rport->lock, flags);
+ nvme_fc_abort_lsops(rport);
+
nvme_fc_rport_put(rport);
return 0;
}
@@ -624,16 +661,16 @@ static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
static void
-__nvme_fc_finish_ls_req(struct nvme_fc_ctrl *ctrl,
- struct nvmefc_ls_req_op *lsop)
+__nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
{
+ struct nvme_fc_rport *rport = lsop->rport;
struct nvmefc_ls_req *lsreq = &lsop->ls_req;
unsigned long flags;
- spin_lock_irqsave(&ctrl->lock, flags);
+ spin_lock_irqsave(&rport->lock, flags);
if (!lsop->req_queued) {
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ spin_unlock_irqrestore(&rport->lock, flags);
return;
}
@@ -641,56 +678,71 @@ __nvme_fc_finish_ls_req(struct nvme_fc_ctrl *ctrl,
lsop->req_queued = false;
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ spin_unlock_irqrestore(&rport->lock, flags);
- fc_dma_unmap_single(ctrl->dev, lsreq->rqstdma,
+ fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
(lsreq->rqstlen + lsreq->rsplen),
DMA_BIDIRECTIONAL);
- nvme_fc_ctrl_put(ctrl);
+ nvme_fc_rport_put(rport);
}
static int
-__nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl,
+__nvme_fc_send_ls_req(struct nvme_fc_rport *rport,
struct nvmefc_ls_req_op *lsop,
void (*done)(struct nvmefc_ls_req *req, int status))
{
struct nvmefc_ls_req *lsreq = &lsop->ls_req;
unsigned long flags;
- int ret;
+ int ret = 0;
- if (!nvme_fc_ctrl_get(ctrl))
+ if (rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
+ return -ECONNREFUSED;
+
+ if (!nvme_fc_rport_get(rport))
return -ESHUTDOWN;
lsreq->done = done;
- lsop->ctrl = ctrl;
+ lsop->rport = rport;
lsop->req_queued = false;
INIT_LIST_HEAD(&lsop->lsreq_list);
init_completion(&lsop->ls_done);
- lsreq->rqstdma = fc_dma_map_single(ctrl->dev, lsreq->rqstaddr,
+ lsreq->rqstdma = fc_dma_map_single(rport->dev, lsreq->rqstaddr,
lsreq->rqstlen + lsreq->rsplen,
DMA_BIDIRECTIONAL);
- if (fc_dma_mapping_error(ctrl->dev, lsreq->rqstdma)) {
- nvme_fc_ctrl_put(ctrl);
- dev_err(ctrl->dev,
- "els request command failed EFAULT.\n");
- return -EFAULT;
+ if (fc_dma_mapping_error(rport->dev, lsreq->rqstdma)) {
+ ret = -EFAULT;
+ goto out_putrport;
}
lsreq->rspdma = lsreq->rqstdma + lsreq->rqstlen;
- spin_lock_irqsave(&ctrl->lock, flags);
+ spin_lock_irqsave(&rport->lock, flags);
- list_add_tail(&lsop->lsreq_list, &ctrl->ls_req_list);
+ list_add_tail(&lsop->lsreq_list, &rport->ls_req_list);
lsop->req_queued = true;
- spin_unlock_irqrestore(&ctrl->lock, flags);
+ spin_unlock_irqrestore(&rport->lock, flags);
- ret = ctrl->lport->ops->ls_req(&ctrl->lport->localport,
- &ctrl->rport->remoteport, lsreq);
+ ret = rport->lport->ops->ls_req(&rport->lport->localport,
+ &rport->remoteport, lsreq);
if (ret)
- lsop->ls_error = ret;
+ goto out_unlink;
+
+ return 0;
+
+out_unlink:
+ lsop->ls_error = ret;
+ spin_lock_irqsave(&rport->lock, flags);
+ lsop->req_queued = false;
+ list_del(&lsop->lsreq_list);
+ spin_unlock_irqrestore(&rport->lock, flags);
+ fc_dma_unmap_single(rport->dev, lsreq->rqstdma,
+ (lsreq->rqstlen + lsreq->rsplen),
+ DMA_BIDIRECTIONAL);
+out_putrport:
+ nvme_fc_rport_put(rport);
return ret;
}
@@ -705,15 +757,15 @@ nvme_fc_send_ls_req_done(struct nvmefc_ls_req *lsreq, int status)
}
static int
-nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
+nvme_fc_send_ls_req(struct nvme_fc_rport *rport, struct nvmefc_ls_req_op *lsop)
{
struct nvmefc_ls_req *lsreq = &lsop->ls_req;
struct fcnvme_ls_rjt *rjt = lsreq->rspaddr;
int ret;
- ret = __nvme_fc_send_ls_req(ctrl, lsop, nvme_fc_send_ls_req_done);
+ ret = __nvme_fc_send_ls_req(rport, lsop, nvme_fc_send_ls_req_done);
- if (!ret)
+ if (!ret) {
/*
* No timeout/not interruptible as we need the struct
* to exist until the lldd calls us back. Thus mandate
@@ -722,14 +774,14 @@ nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
*/
wait_for_completion(&lsop->ls_done);
- __nvme_fc_finish_ls_req(ctrl, lsop);
+ __nvme_fc_finish_ls_req(lsop);
- if (ret) {
- dev_err(ctrl->dev,
- "ls request command failed (%d).\n", ret);
- return ret;
+ ret = lsop->ls_error;
}
+ if (ret)
+ return ret;
+
/* ACC or RJT payload ? */
if (rjt->w0.ls_cmd == FCNVME_LS_RJT)
return -ENXIO;
@@ -737,19 +789,14 @@ nvme_fc_send_ls_req(struct nvme_fc_ctrl *ctrl, struct nvmefc_ls_req_op *lsop)
return 0;
}
-static void
-nvme_fc_send_ls_req_async(struct nvme_fc_ctrl *ctrl,
+static int
+nvme_fc_send_ls_req_async(struct nvme_fc_rport *rport,
struct nvmefc_ls_req_op *lsop,
void (*done)(struct nvmefc_ls_req *req, int status))
{
- int ret;
-
- ret = __nvme_fc_send_ls_req(ctrl, lsop, done);
-
/* don't wait for completion */
- if (ret)
- done(&lsop->ls_req, ret);
+ return __nvme_fc_send_ls_req(rport, lsop, done);
}
/* Validation Error indexes into the string table below */
@@ -839,7 +886,7 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
lsreq->rsplen = sizeof(*assoc_acc);
lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
- ret = nvme_fc_send_ls_req(ctrl, lsop);
+ ret = nvme_fc_send_ls_req(ctrl->rport, lsop);
if (ret)
goto out_free_buffer;
@@ -848,11 +895,12 @@ nvme_fc_connect_admin_queue(struct nvme_fc_ctrl *ctrl,
/* validate the ACC response */
if (assoc_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
fcret = VERR_LSACC;
- if (assoc_acc->hdr.desc_list_len !=
+ else if (assoc_acc->hdr.desc_list_len !=
fcnvme_lsdesc_len(
sizeof(struct fcnvme_ls_cr_assoc_acc)))
fcret = VERR_CR_ASSOC_ACC_LEN;
- if (assoc_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+ else if (assoc_acc->hdr.rqst.desc_tag !=
+ cpu_to_be32(FCNVME_LSDESC_RQST))
fcret = VERR_LSDESC_RQST;
else if (assoc_acc->hdr.rqst.desc_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
@@ -946,7 +994,7 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
lsreq->rsplen = sizeof(*conn_acc);
lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
- ret = nvme_fc_send_ls_req(ctrl, lsop);
+ ret = nvme_fc_send_ls_req(ctrl->rport, lsop);
if (ret)
goto out_free_buffer;
@@ -955,10 +1003,10 @@ nvme_fc_connect_queue(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
/* validate the ACC response */
if (conn_acc->hdr.w0.ls_cmd != FCNVME_LS_ACC)
fcret = VERR_LSACC;
- if (conn_acc->hdr.desc_list_len !=
+ else if (conn_acc->hdr.desc_list_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_ls_cr_conn_acc)))
fcret = VERR_CR_CONN_ACC_LEN;
- if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
+ else if (conn_acc->hdr.rqst.desc_tag != cpu_to_be32(FCNVME_LSDESC_RQST))
fcret = VERR_LSDESC_RQST;
else if (conn_acc->hdr.rqst.desc_len !=
fcnvme_lsdesc_len(sizeof(struct fcnvme_lsdesc_rqst)))
@@ -997,14 +1045,8 @@ static void
nvme_fc_disconnect_assoc_done(struct nvmefc_ls_req *lsreq, int status)
{
struct nvmefc_ls_req_op *lsop = ls_req_to_lsop(lsreq);
- struct nvme_fc_ctrl *ctrl = lsop->ctrl;
- __nvme_fc_finish_ls_req(ctrl, lsop);
-
- if (status)
- dev_err(ctrl->dev,
- "disconnect assoc ls request command failed (%d).\n",
- status);
+ __nvme_fc_finish_ls_req(lsop);
/* fc-nvme initiator doesn't care about success or failure of cmd */
@@ -1035,6 +1077,7 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
struct fcnvme_ls_disconnect_acc *discon_acc;
struct nvmefc_ls_req_op *lsop;
struct nvmefc_ls_req *lsreq;
+ int ret;
lsop = kzalloc((sizeof(*lsop) +
ctrl->lport->ops->lsrqst_priv_sz +
@@ -1077,7 +1120,10 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl)
lsreq->rsplen = sizeof(*discon_acc);
lsreq->timeout = NVME_FC_CONNECT_TIMEOUT_SEC;
- nvme_fc_send_ls_req_async(ctrl, lsop, nvme_fc_disconnect_assoc_done);
+ ret = nvme_fc_send_ls_req_async(ctrl->rport, lsop,
+ nvme_fc_disconnect_assoc_done);
+ if (ret)
+ kfree(lsop);
/* only meaningful part to terminating the association */
ctrl->association_id = 0;
@@ -1146,7 +1192,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
struct nvme_fc_ctrl *ctrl = op->ctrl;
struct nvme_fc_queue *queue = op->queue;
struct nvme_completion *cqe = &op->rsp_iu.cqe;
- u16 status;
+ __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
+ union nvme_result result;
/*
* WARNING:
@@ -1181,9 +1228,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
sizeof(op->rsp_iu), DMA_FROM_DEVICE);
if (atomic_read(&op->state) == FCPOP_STATE_ABORTED)
- status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
- else
- status = freq->status;
+ status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
+ else if (freq->status)
+ status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
/*
* For the linux implementation, if we have an unsuccesful
@@ -1211,10 +1258,10 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
*/
if (freq->transferred_length !=
be32_to_cpu(op->cmd_iu.data_len)) {
- status = -EIO;
+ status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
goto done;
}
- op->nreq.result.u64 = 0;
+ result.u64 = 0;
break;
case sizeof(struct nvme_fc_ersp_iu):
@@ -1226,28 +1273,28 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
(freq->rcv_rsplen / 4) ||
be32_to_cpu(op->rsp_iu.xfrd_len) !=
freq->transferred_length ||
+ op->rsp_iu.status_code ||
op->rqno != le16_to_cpu(cqe->command_id))) {
- status = -EIO;
+ status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
goto done;
}
- op->nreq.result = cqe->result;
- status = le16_to_cpu(cqe->status) >> 1;
+ result = cqe->result;
+ status = cqe->status;
break;
default:
- status = -EIO;
+ status = cpu_to_le16(NVME_SC_FC_TRANSPORT_ERROR << 1);
goto done;
}
done:
if (!queue->qnum && op->rqno >= AEN_CMDID_BASE) {
- nvme_complete_async_event(&queue->ctrl->ctrl, status,
- &op->nreq.result);
+ nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
nvme_fc_ctrl_put(ctrl);
return;
}
- blk_mq_complete_request(rq, status);
+ nvme_end_request(rq, status, result);
}
static int
@@ -1761,7 +1808,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
op->fcp_req.io_dir = io_dir;
op->fcp_req.transferred_length = 0;
op->fcp_req.rcv_rsplen = 0;
- op->fcp_req.status = 0;
+ op->fcp_req.status = NVME_SC_SUCCESS;
op->fcp_req.sqid = cpu_to_le16(queue->qnum);
/*
@@ -1923,32 +1970,18 @@ nvme_fc_complete_rq(struct request *rq)
{
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
struct nvme_fc_ctrl *ctrl = op->ctrl;
- int error = 0, state;
+ int state;
state = atomic_xchg(&op->state, FCPOP_STATE_IDLE);
nvme_cleanup_cmd(rq);
-
nvme_fc_unmap_data(ctrl, rq, op);
-
- if (unlikely(rq->errors)) {
- if (nvme_req_needs_retry(rq, rq->errors)) {
- nvme_requeue_req(rq);
- return;
- }
-
- if (blk_rq_is_passthrough(rq))
- error = rq->errors;
- else
- error = nvme_error_status(rq->errors);
- }
-
+ nvme_complete_rq(rq);
nvme_fc_ctrl_put(ctrl);
- blk_mq_end_request(rq, error);
}
-static struct blk_mq_ops nvme_fc_mq_ops = {
+static const struct blk_mq_ops nvme_fc_mq_ops = {
.queue_rq = nvme_fc_queue_rq,
.complete = nvme_fc_complete_rq,
.init_request = nvme_fc_init_request,
@@ -1959,7 +1992,7 @@ static struct blk_mq_ops nvme_fc_mq_ops = {
.timeout = nvme_fc_timeout,
};
-static struct blk_mq_ops nvme_fc_admin_mq_ops = {
+static const struct blk_mq_ops nvme_fc_admin_mq_ops = {
.queue_rq = nvme_fc_queue_rq,
.complete = nvme_fc_complete_rq,
.init_request = nvme_fc_init_admin_request,
@@ -2314,7 +2347,6 @@ __nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->ctrl_list);
- INIT_LIST_HEAD(&ctrl->ls_req_list);
ctrl->lport = lport;
ctrl->rport = rport;
ctrl->dev = lport->dev;
@@ -2546,11 +2578,20 @@ static struct nvmf_transport_ops nvme_fc_transport = {
static int __init nvme_fc_init_module(void)
{
+ int ret;
+
nvme_fc_wq = create_workqueue("nvme_fc_wq");
if (!nvme_fc_wq)
return -ENOMEM;
- return nvmf_register_transport(&nvme_fc_transport);
+ ret = nvmf_register_transport(&nvme_fc_transport);
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ destroy_workqueue(nvme_fc_wq);
+ return ret;
}
static void __exit nvme_fc_exit_module(void)
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 21cac8523bd8e..de61a4a03d782 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -241,9 +241,9 @@ static inline void _nvme_nvm_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
- BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 128);
+ BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
BUILD_BUG_ON(sizeof(struct nvme_nvm_id) != 4096);
- BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_nvm_bb_tbl) != 64);
}
static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
@@ -324,7 +324,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id)
nvm_id->cap = le32_to_cpu(nvme_nvm_id->cap);
nvm_id->dom = le32_to_cpu(nvme_nvm_id->dom);
memcpy(&nvm_id->ppaf, &nvme_nvm_id->ppaf,
- sizeof(struct nvme_nvm_addr_format));
+ sizeof(struct nvm_addr_format));
ret = init_grps(nvm_id, nvme_nvm_id);
out:
@@ -484,7 +484,7 @@ static void nvme_nvm_end_io(struct request *rq, int error)
struct nvm_rq *rqd = rq->end_io_data;
rqd->ppa_status = nvme_req(rq)->result.u64;
- rqd->error = error;
+ rqd->error = nvme_req(rq)->status;
nvm_end_io(rqd);
kfree(nvme_req(rq)->cmd);
@@ -510,12 +510,12 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
}
rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
- rq->ioprio = bio_prio(bio);
- if (bio_has_data(bio))
- rq->nr_phys_segments = bio_phys_segments(q, bio);
-
- rq->__data_len = bio->bi_iter.bi_size;
- rq->bio = rq->biotail = bio;
+ if (bio) {
+ blk_init_request_from_bio(rq, bio);
+ } else {
+ rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
+ rq->__data_len = 0;
+ }
nvme_nvm_rqtocmd(rq, rqd, ns, cmd);
@@ -526,21 +526,6 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
return 0;
}
-static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
- struct request_queue *q = dev->q;
- struct nvme_ns *ns = q->queuedata;
- struct nvme_nvm_command c = {};
-
- c.erase.opcode = NVM_OP_ERASE;
- c.erase.nsid = cpu_to_le32(ns->ns_id);
- c.erase.spba = cpu_to_le64(rqd->ppa_addr.ppa);
- c.erase.length = cpu_to_le16(rqd->nr_ppas - 1);
- c.erase.control = cpu_to_le16(rqd->flags);
-
- return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0);
-}
-
static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
{
struct nvme_ns *ns = nvmdev->q->queuedata;
@@ -576,7 +561,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = {
.set_bb_tbl = nvme_nvm_set_bb_tbl,
.submit_io = nvme_nvm_submit_io,
- .erase_block = nvme_nvm_erase_block,
.create_dma_pool = nvme_nvm_create_dma_pool,
.destroy_dma_pool = nvme_nvm_destroy_dma_pool,
@@ -611,7 +595,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q,
__le64 *metadata = NULL;
dma_addr_t metadata_dma;
DECLARE_COMPLETION_ONSTACK(wait);
- int ret;
+ int ret = 0;
rq = nvme_alloc_request(q, (struct nvme_command *)vcmd, 0,
NVME_QID_ANY);
@@ -681,9 +665,12 @@ submit:
wait_for_completion_io(&wait);
- ret = nvme_error_status(rq->errors);
+ if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
+ ret = -EINTR;
+ else if (nvme_req(rq)->status & 0x7ff)
+ ret = -EIO;
if (result)
- *result = rq->errors & 0x7ff;
+ *result = nvme_req(rq)->status & 0x7ff;
if (status)
*status = le64_to_cpu(nvme_req(rq)->result.u64);
@@ -766,7 +753,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3);
/* cdw11-12 */
c.ph_rw.length = cpu_to_le16(vcmd.nppas);
- c.ph_rw.control = cpu_to_le32(vcmd.control);
+ c.ph_rw.control = cpu_to_le16(vcmd.control);
c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13);
c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14);
c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15);
@@ -809,6 +796,8 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
struct request_queue *q = ns->queue;
struct nvm_dev *dev;
+ _nvme_nvm_check_size();
+
dev = nvm_alloc_dev(node);
if (!dev)
return -ENOMEM;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ab2d6ec7eb5cc..29c708ca9621c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -21,16 +21,6 @@
#include <linux/lightnvm.h>
#include <linux/sed-opal.h>
-enum {
- /*
- * Driver internal status code for commands that were cancelled due
- * to timeouts or controller shutdown. The value is negative so
- * that it a) doesn't overlap with the unsigned hardware error codes,
- * and b) can easily be tested for.
- */
- NVME_SC_CANCELLED = -EINTR,
-};
-
extern unsigned char nvme_io_timeout;
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
@@ -43,8 +33,6 @@ extern unsigned char shutdown_timeout;
#define NVME_DEFAULT_KATO 5
#define NVME_KATO_GRACE 10
-extern unsigned int nvme_max_retries;
-
enum {
NVME_NS_LBA = 0,
NVME_NS_LIGHTNVM = 1,
@@ -68,10 +56,10 @@ enum nvme_quirks {
NVME_QUIRK_IDENTIFY_CNS = (1 << 1),
/*
- * The controller deterministically returns O's on reads to discarded
- * logical blocks.
+ * The controller deterministically returns 0's on reads to
+ * logical blocks that deallocate was called on.
*/
- NVME_QUIRK_DISCARD_ZEROES = (1 << 2),
+ NVME_QUIRK_DEALLOCATE_ZEROES = (1 << 2),
/*
* The controller needs a delay before starts checking the device
@@ -97,6 +85,13 @@ enum nvme_quirks {
+/*
+ * NVMe-specific state attached to a block layer request; nvme_req()
+ * returns a pointer to it for a given request.
+ */
struct nvme_request {
struct nvme_command *cmd;
union nvme_result result;
+ /* presumably the per-request retry counter -- TODO confirm against core.c */
+ u8 retries;
+ /* bitmask of NVME_REQ_* flags, e.g. NVME_REQ_CANCELLED set on timeout */
+ u8 flags;
+ /* completion status as stored by nvme_end_request() (CQE status >> 1) */
+ u16 status;
+};
+
+/* Bits for nvme_request.flags. */
+enum {
+ NVME_REQ_CANCELLED = (1 << 0),
};
static inline struct nvme_request *nvme_req(struct request *req)
@@ -254,25 +249,17 @@ static inline void nvme_cleanup_cmd(struct request *req)
}
}
-static inline int nvme_error_status(u16 status)
+/*
+ * Record a command's completion in its nvme_request and complete it.
+ *
+ * @status is the little-endian status word from the completion entry; it
+ * is converted to CPU order and shifted right by one before being stored
+ * (presumably to drop the phase bit -- confirm against the CQE layout).
+ * @result is stored unchanged.  The request is then handed to
+ * blk_mq_complete_request().
+ */
+static inline void nvme_end_request(struct request *req, __le16 status,
+ union nvme_result result)
{
- switch (status & 0x7ff) {
- case NVME_SC_SUCCESS:
- return 0;
- case NVME_SC_CAP_EXCEEDED:
- return -ENOSPC;
- default:
- return -EIO;
- }
-}
+ struct nvme_request *rq = nvme_req(req);
-static inline bool nvme_req_needs_retry(struct request *req, u16 status)
-{
- return !(status & NVME_SC_DNR || blk_noretry_request(req)) &&
- (jiffies - req->start_time) < req->timeout &&
- req->retries < nvme_max_retries;
+ rq->status = le16_to_cpu(status) >> 1;
+ rq->result = result;
+ blk_mq_complete_request(req);
}
+void nvme_complete_rq(struct request *req);
void nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
@@ -307,7 +294,6 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl);
#define NVME_QID_ANY -1
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags, int qid);
-void nvme_requeue_req(struct request *req);
int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmd);
int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5d309535abbd6..c8541c3dcd19d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -104,8 +104,22 @@ struct nvme_dev {
u32 cmbloc;
struct nvme_ctrl ctrl;
struct completion ioq_wait;
+ u32 *dbbuf_dbs;
+ dma_addr_t dbbuf_dbs_dma_addr;
+ u32 *dbbuf_eis;
+ dma_addr_t dbbuf_eis_dma_addr;
};
+/*
+ * Index of queue @qid's submission-queue entry within a dbbuf array.
+ * Each queue owns two consecutive slots (SQ first, then CQ), and slots
+ * are spaced @stride entries apart.
+ */
+static inline unsigned int sq_idx(unsigned int qid, u32 stride)
+{
+	unsigned int slot = qid * 2;
+
+	return slot * stride;
+}
+
+/*
+ * Index of queue @qid's completion-queue entry within a dbbuf array:
+ * the slot immediately after the queue's submission-queue slot, with
+ * slots spaced @stride entries apart.
+ */
+static inline unsigned int cq_idx(unsigned int qid, u32 stride)
+{
+	unsigned int slot = qid * 2 + 1;
+
+	return slot * stride;
+}
+
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
return container_of(ctrl, struct nvme_dev, ctrl);
@@ -134,6 +148,10 @@ struct nvme_queue {
u16 qid;
u8 cq_phase;
u8 cqe_seen;
+ u32 *dbbuf_sq_db;
+ u32 *dbbuf_cq_db;
+ u32 *dbbuf_sq_ei;
+ u32 *dbbuf_cq_ei;
};
/*
@@ -172,6 +190,112 @@ static inline void _nvme_check_size(void)
BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+}
+
+/*
+ * Size in bytes of one dbbuf array (doorbells or event indexes).
+ *
+ * Sized for num_possible_cpus() + 1 queues (presumably one I/O queue per
+ * possible CPU plus the admin queue -- confirm), at 8 * @stride bytes per
+ * queue.
+ */
+static inline unsigned int nvme_dbbuf_size(u32 stride)
+{
+	unsigned int nr_queues = num_possible_cpus() + 1;
+
+	return nr_queues * 8 * stride;
+}
+
+/*
+ * Allocate the coherent DMA buffers backing the doorbell (dbbuf_dbs) and
+ * event-index (dbbuf_eis) arrays.
+ *
+ * Returns 0 on success, or if the doorbell buffer is already allocated,
+ * and -ENOMEM on allocation failure.  On failure neither buffer is left
+ * allocated and dev->dbbuf_dbs is NULL.
+ */
+static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+{
+	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+
+	/* Already set up by an earlier call: nothing to do. */
+	if (dev->dbbuf_dbs)
+		return 0;
+
+	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
+					    &dev->dbbuf_dbs_dma_addr,
+					    GFP_KERNEL);
+	if (!dev->dbbuf_dbs)
+		return -ENOMEM;
+
+	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
+					    &dev->dbbuf_eis_dma_addr,
+					    GFP_KERNEL);
+	if (dev->dbbuf_eis)
+		return 0;
+
+	/* Second allocation failed: undo the first so the state stays all-or-nothing. */
+	dma_free_coherent(dev->dev, mem_size,
+			  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+	dev->dbbuf_dbs = NULL;
+	return -ENOMEM;
+}
+
+/*
+ * Release whichever of the two dbbuf DMA buffers are currently allocated
+ * and clear the corresponding pointers, so the function is safe to call
+ * more than once.
+ */
+static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
+{
+	const unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+	struct device *dma_dev = dev->dev;
+
+	if (dev->dbbuf_dbs) {
+		dma_free_coherent(dma_dev, mem_size,
+				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
+		dev->dbbuf_dbs = NULL;
+	}
+
+	if (dev->dbbuf_eis) {
+		dma_free_coherent(dma_dev, mem_size,
+				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
+		dev->dbbuf_eis = NULL;
+	}
+}
+
+/*
+ * Point a queue's dbbuf doorbell and event-index pointers at its slots in
+ * the device-wide dbbuf arrays.
+ *
+ * Does nothing for qid 0 or when the dbbuf arrays are not allocated; the
+ * queue's pointers are left untouched in that case.
+ */
+static void nvme_dbbuf_init(struct nvme_dev *dev,
+			    struct nvme_queue *nvmeq, int qid)
+{
+	unsigned int sq, cq;
+
+	if (!qid || !dev->dbbuf_dbs)
+		return;
+
+	sq = sq_idx(qid, dev->db_stride);
+	cq = cq_idx(qid, dev->db_stride);
+
+	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq];
+	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq];
+	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq];
+	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq];
+}
+
+/*
+ * Hand the DMA addresses of the dbbuf arrays to the controller with an
+ * nvme_admin_dbbuf command (prp1 = doorbells, prp2 = event indexes).
+ *
+ * A no-op when the buffers are not allocated.  If the command fails, a
+ * warning is logged and the buffers are freed.
+ *
+ * NOTE(review): queues already set up by nvme_dbbuf_init() keep their
+ * pointers into the buffers freed on the failure path -- confirm that
+ * nothing dereferences them afterwards.
+ */
+static void nvme_dbbuf_set(struct nvme_dev *dev)
+{
+	struct nvme_command c;
+	int ret;
+
+	if (!dev->dbbuf_dbs)
+		return;
+
+	memset(&c, 0, sizeof(c));
+	c.dbbuf.opcode = nvme_admin_dbbuf;
+	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
+	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
+
+	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
+	if (!ret)
+		return;
+
+	dev_warn(dev->dev, "unable to set dbbuf\n");
+	/* Free memory and continue on */
+	nvme_dbbuf_dma_free(dev);
+}
+
+/*
+ * Wrap-safe test of whether moving an index from @old to @new_idx stepped
+ * past @event_idx.  All arithmetic is modulo 2^16.  Returns non-zero when
+ * it did.
+ */
+static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
+{
+	u16 past_event = new_idx - event_idx - 1;
+	u16 advanced = new_idx - old;
+
+	return past_event < advanced;
+}
+
+/*
+ * Update dbbuf and return true if an MMIO is required.
+ *
+ * @value:    new doorbell value (queue tail or head)
+ * @dbbuf_db: the queue's doorbell slot in memory, or NULL when dbbuf is
+ *            not in use for this queue
+ * @dbbuf_ei: the queue's event-index slot, read to decide whether the
+ *            MMIO doorbell write can be skipped
+ *
+ * With @dbbuf_db == NULL this always returns true, so callers fall back
+ * to the plain MMIO doorbell write.
+ */
+static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
+					      volatile u32 *dbbuf_ei)
+{
+	if (dbbuf_db) {
+		u16 old_value;
+
+		/*
+		 * Ensure that the queue is written before updating
+		 * the doorbell in memory
+		 */
+		wmb();
+
+		old_value = *dbbuf_db;
+		*dbbuf_db = value;
+
+		/*
+		 * Ensure that the doorbell is updated before reading the
+		 * event index from memory.  Without a full barrier the load
+		 * below may be ordered ahead of the store above, in which
+		 * case a stale event index can make us skip an MMIO write
+		 * the controller is waiting for.
+		 */
+		mb();
+
+		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
+			return false;
+	}
+
+	return true;
}
/*
@@ -298,7 +422,9 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
if (++tail == nvmeq->q_depth)
tail = 0;
- writel(tail, nvmeq->q_db);
+ if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
+ nvmeq->dbbuf_sq_ei))
+ writel(tail, nvmeq->q_db);
nvmeq->sq_tail = tail;
}
@@ -327,10 +453,6 @@ static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
iod->nents = 0;
iod->length = size;
- if (!(rq->rq_flags & RQF_DONTPREP)) {
- rq->retries = 0;
- rq->rq_flags |= RQF_DONTPREP;
- }
return BLK_MQ_RQ_QUEUE_OK;
}
@@ -629,34 +751,12 @@ out_free_cmd:
return ret;
}
-static void nvme_complete_rq(struct request *req)
+/*
+ * blk-mq ->complete handler for the PCIe transport: undo the request's
+ * data mapping, then hand the request to nvme_complete_rq().
+ */
+static void nvme_pci_complete_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_dev *dev = iod->nvmeq->dev;
- int error = 0;
-
- nvme_unmap_data(dev, req);
-
- if (unlikely(req->errors)) {
- if (nvme_req_needs_retry(req, req->errors)) {
- req->retries++;
- nvme_requeue_req(req);
- return;
- }
-
- if (blk_rq_is_passthrough(req))
- error = req->errors;
- else
- error = nvme_error_status(req->errors);
- }
-
- if (unlikely(iod->aborted)) {
- dev_warn(dev->ctrl.device,
- "completing aborted command with status: %04x\n",
- req->errors);
- }
- blk_mq_end_request(req, error);
+ nvme_unmap_data(iod->nvmeq->dev, req);
+ nvme_complete_rq(req);
}
/* We read the CQE phase first to check if the rest of the entry is valid */
@@ -706,15 +806,16 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
}
req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
- nvme_req(req)->result = cqe.result;
- blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1);
+ nvme_end_request(req, cqe.status, cqe.result);
}
if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
return;
if (likely(nvmeq->cq_vector >= 0))
- writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+ if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
+ nvmeq->dbbuf_cq_ei))
+ writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
nvmeq->cq_head = head;
nvmeq->cq_phase = phase;
@@ -746,10 +847,8 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
return IRQ_NONE;
}
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
{
- struct nvme_queue *nvmeq = hctx->driver_data;
-
if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
spin_lock_irq(&nvmeq->q_lock);
__nvme_process_cq(nvmeq, &tag);
@@ -762,6 +861,13 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
return 0;
}
+/*
+ * blk-mq poll entry point: look up the nvme_queue stored in the hardware
+ * context's driver_data and poll it for @tag via __nvme_poll().
+ */
+static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+{
+	return __nvme_poll(hctx->driver_data, tag);
+}
+
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -813,7 +919,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
struct nvme_queue *nvmeq)
{
struct nvme_command c;
- int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+ int flags = NVME_QUEUE_PHYS_CONTIG;
/*
* Note: we (ab)use the fact that the prp fields survive if no data
@@ -844,9 +950,9 @@ static void abort_endio(struct request *req, int error)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = iod->nvmeq;
- u16 status = req->errors;
- dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", status);
+ dev_warn(nvmeq->dev->ctrl.device,
+ "Abort status: 0x%x", nvme_req(req)->status);
atomic_inc(&nvmeq->dev->ctrl.abort_limit);
blk_mq_free_request(req);
}
@@ -860,6 +966,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
struct nvme_command cmd;
/*
+ * Did we miss an interrupt?
+ */
+ if (__nvme_poll(nvmeq, req->tag)) {
+ dev_warn(dev->ctrl.device,
+ "I/O %d QID %d timeout, completion polled\n",
+ req->tag, nvmeq->qid);
+ return BLK_EH_HANDLED;
+ }
+
+ /*
* Shutdown immediately if controller times out while starting. The
* reset work will see the pci device disabled when it gets the forced
* cancellation error. All outstanding requests are completed on
@@ -870,7 +986,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
nvme_dev_disable(dev, false);
- req->errors = NVME_SC_CANCELLED;
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_HANDLED;
}
@@ -890,7 +1006,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
* Mark the request as handled, since the inline shutdown
* forces all outstanding requests to complete.
*/
- req->errors = NVME_SC_CANCELLED;
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_HANDLED;
}
@@ -1098,6 +1214,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+ nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
spin_unlock_irq(&nvmeq->q_lock);
}
@@ -1130,18 +1247,18 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
return result;
}
-static struct blk_mq_ops nvme_mq_admin_ops = {
+static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
- .complete = nvme_complete_rq,
+ .complete = nvme_pci_complete_rq,
.init_hctx = nvme_admin_init_hctx,
.exit_hctx = nvme_admin_exit_hctx,
.init_request = nvme_admin_init_request,
.timeout = nvme_timeout,
};
-static struct blk_mq_ops nvme_mq_ops = {
+static const struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
- .complete = nvme_complete_rq,
+ .complete = nvme_pci_complete_rq,
.init_hctx = nvme_init_hctx,
.init_request = nvme_init_request,
.map_queues = nvme_pci_map_queues,
@@ -1570,6 +1687,8 @@ static int nvme_dev_add(struct nvme_dev *dev)
if (blk_mq_alloc_tag_set(&dev->tagset))
return 0;
dev->ctrl.tagset = &dev->tagset;
+
+ nvme_dbbuf_set(dev);
} else {
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
@@ -1756,6 +1875,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
+ nvme_dbbuf_dma_free(dev);
put_device(dev->dev);
if (dev->tagset.tags)
blk_mq_free_tag_set(&dev->tagset);
@@ -1823,6 +1943,13 @@ static void nvme_reset_work(struct work_struct *work)
dev->ctrl.opal_dev = NULL;
}
+ if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
+ result = nvme_dbbuf_dma_alloc(dev);
+ if (result)
+ dev_warn(dev->dev,
+ "unable to allocate dma for dbbuf\n");
+ }
+
result = nvme_setup_io_queues(dev);
if (result)
goto out;
@@ -2159,13 +2286,13 @@ static const struct pci_error_handlers nvme_err_handler = {
static const struct pci_device_id nvme_id_table[] = {
{ PCI_VDEVICE(INTEL, 0x0953),
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DISCARD_ZEROES, },
+ NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a53),
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DISCARD_ZEROES, },
+ NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x0a54),
.driver_data = NVME_QUIRK_STRIPE_SIZE |
- NVME_QUIRK_DISCARD_ZEROES, },
+ NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 16f84eb0b95e8..29cf88ac3f61d 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -34,7 +34,7 @@
#include "fabrics.h"
-#define NVME_RDMA_CONNECT_TIMEOUT_MS 1000 /* 1 second */
+#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */
#define NVME_RDMA_MAX_SEGMENT_SIZE 0xffffff /* 24-bit SGL field */
@@ -118,7 +118,6 @@ struct nvme_rdma_ctrl {
struct nvme_rdma_qe async_event_sqe;
- int reconnect_delay;
struct delayed_work reconnect_work;
struct list_head list;
@@ -129,14 +128,8 @@ struct nvme_rdma_ctrl {
u64 cap;
u32 max_fr_pages;
- union {
- struct sockaddr addr;
- struct sockaddr_in addr_in;
- };
- union {
- struct sockaddr src_addr;
- struct sockaddr_in src_addr_in;
- };
+ struct sockaddr_storage addr;
+ struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
};
@@ -569,11 +562,12 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
return PTR_ERR(queue->cm_id);
}
- queue->cm_error = -ETIMEDOUT;
if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
- src_addr = &ctrl->src_addr;
+ src_addr = (struct sockaddr *)&ctrl->src_addr;
- ret = rdma_resolve_addr(queue->cm_id, src_addr, &ctrl->addr,
+ queue->cm_error = -ETIMEDOUT;
+ ret = rdma_resolve_addr(queue->cm_id, src_addr,
+ (struct sockaddr *)&ctrl->addr,
NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
dev_info(ctrl->ctrl.device,
@@ -712,6 +706,26 @@ free_ctrl:
kfree(ctrl);
}
+/*
+ * Decide what to do after a failed connection or reconnect attempt.
+ *
+ * Only acts while the controller is in NVME_CTRL_RECONNECTING; being
+ * called in the NEW or LIVE state triggers a one-time warning.
+ * Otherwise it either schedules another reconnect attempt after
+ * opts->reconnect_delay seconds or, when nvmf_should_reconnect() says to
+ * give up, queues the controller's delete work.
+ */
+static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
+{
+	struct nvme_ctrl *nctrl = &ctrl->ctrl;
+
+	/* If we are resetting/deleting then do nothing */
+	if (nctrl->state != NVME_CTRL_RECONNECTING) {
+		WARN_ON_ONCE(nctrl->state == NVME_CTRL_NEW ||
+			     nctrl->state == NVME_CTRL_LIVE);
+		return;
+	}
+
+	if (!nvmf_should_reconnect(nctrl)) {
+		dev_info(nctrl->device, "Removing controller...\n");
+		queue_work(nvme_rdma_wq, &ctrl->delete_work);
+		return;
+	}
+
+	dev_info(nctrl->device, "Reconnecting in %d seconds...\n",
+		 nctrl->opts->reconnect_delay);
+	queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
+			   nctrl->opts->reconnect_delay * HZ);
+}
+
static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
{
struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
@@ -719,6 +733,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
bool changed;
int ret;
+ ++ctrl->ctrl.opts->nr_reconnects;
+
if (ctrl->queue_count > 1) {
nvme_rdma_free_io_queues(ctrl);
@@ -763,6 +779,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
+ ctrl->ctrl.opts->nr_reconnects = 0;
if (ctrl->queue_count > 1) {
nvme_start_queues(&ctrl->ctrl);
@@ -777,13 +794,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
stop_admin_q:
blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
requeue:
- /* Make sure we are not resetting/deleting */
- if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
- dev_info(ctrl->ctrl.device,
- "Failed reconnect attempt, requeueing...\n");
- queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
- ctrl->reconnect_delay * HZ);
- }
+ dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
+ ctrl->ctrl.opts->nr_reconnects);
+ nvme_rdma_reconnect_or_remove(ctrl);
}
static void nvme_rdma_error_recovery_work(struct work_struct *work)
@@ -810,11 +823,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_cancel_request, &ctrl->ctrl);
- dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
- ctrl->reconnect_delay);
-
- queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
- ctrl->reconnect_delay * HZ);
+ nvme_rdma_reconnect_or_remove(ctrl);
}
static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
@@ -1169,8 +1178,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
wc->ex.invalidate_rkey == req->mr->rkey)
req->mr->need_inval = false;
- req->req.result = cqe->result;
- blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
+ nvme_end_request(rq, cqe->status, cqe->result);
return ret;
}
@@ -1407,7 +1415,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
nvme_rdma_error_recovery(req->queue->ctrl);
/* fail with DNR on cmd timeout */
- rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
return BLK_EH_HANDLED;
}
@@ -1509,27 +1517,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+/*
+ * blk-mq ->complete handler for the RDMA transport: unmap the request's
+ * data on its queue, then hand the request to nvme_complete_rq().
+ */
static void nvme_rdma_complete_rq(struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_rdma_queue *queue = req->queue;
- int error = 0;
-
- nvme_rdma_unmap_data(queue, rq);
- if (unlikely(rq->errors)) {
- if (nvme_req_needs_retry(rq, rq->errors)) {
- nvme_requeue_req(rq);
- return;
- }
-
- if (blk_rq_is_passthrough(rq))
- error = rq->errors;
- else
- error = nvme_error_status(rq->errors);
- }
-
- blk_mq_end_request(rq, error);
+ nvme_rdma_unmap_data(req->queue, rq);
+ nvme_complete_rq(rq);
}
-static struct blk_mq_ops nvme_rdma_mq_ops = {
+static const struct blk_mq_ops nvme_rdma_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
.init_request = nvme_rdma_init_request,
@@ -1540,7 +1533,7 @@ static struct blk_mq_ops nvme_rdma_mq_ops = {
.timeout = nvme_rdma_timeout,
};
-static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
+static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
.queue_rq = nvme_rdma_queue_rq,
.complete = nvme_rdma_complete_rq,
.init_request = nvme_rdma_init_admin_request,
@@ -1857,27 +1850,13 @@ out_free_io_queues:
return ret;
}
-static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
-{
- u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
- size_t buflen = strlen(p);
-
- /* XXX: handle IPv6 addresses */
-
- if (buflen > INET_ADDRSTRLEN)
- return -EINVAL;
- if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
- return -EINVAL;
- in_addr->sin_family = AF_INET;
- return 0;
-}
-
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_rdma_ctrl *ctrl;
int ret;
bool changed;
+ char *port;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
@@ -1885,40 +1864,33 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
- ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
+ if (opts->mask & NVMF_OPT_TRSVCID)
+ port = opts->trsvcid;
+ else
+ port = __stringify(NVME_RDMA_IP_PORT);
+
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->traddr, port, &ctrl->addr);
if (ret) {
- pr_err("malformed IP address passed: %s\n", opts->traddr);
+ pr_err("malformed address passed: %s:%s\n", opts->traddr, port);
goto out_free_ctrl;
}
if (opts->mask & NVMF_OPT_HOST_TRADDR) {
- ret = nvme_rdma_parse_ipaddr(&ctrl->src_addr_in,
- opts->host_traddr);
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->host_traddr, NULL, &ctrl->src_addr);
if (ret) {
- pr_err("malformed src IP address passed: %s\n",
+ pr_err("malformed src address passed: %s\n",
opts->host_traddr);
goto out_free_ctrl;
}
}
- if (opts->mask & NVMF_OPT_TRSVCID) {
- u16 port;
-
- ret = kstrtou16(opts->trsvcid, 0, &port);
- if (ret)
- goto out_free_ctrl;
-
- ctrl->addr_in.sin_port = cpu_to_be16(port);
- } else {
- ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
- }
-
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
0 /* no quirks, we're perfect! */);
if (ret)
goto out_free_ctrl;
- ctrl->reconnect_delay = opts->reconnect_delay;
INIT_DELAYED_WORK(&ctrl->reconnect_work,
nvme_rdma_reconnect_ctrl_work);
INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
@@ -1977,7 +1949,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
- dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+ dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
kref_get(&ctrl->ctrl.kref);
@@ -2013,7 +1985,7 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
.name = "rdma",
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
- NVMF_OPT_HOST_TRADDR,
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
.create_ctrl = nvme_rdma_create_ctrl,
};
@@ -2055,12 +2027,20 @@ static int __init nvme_rdma_init_module(void)
return -ENOMEM;
ret = ib_register_client(&nvme_rdma_ib_client);
- if (ret) {
- destroy_workqueue(nvme_rdma_wq);
- return ret;
- }
+ if (ret)
+ goto err_destroy_wq;
+
+ ret = nvmf_register_transport(&nvme_rdma_transport);
+ if (ret)
+ goto err_unreg_client;
+
+ return 0;
- return nvmf_register_transport(&nvme_rdma_transport);
+err_unreg_client:
+ ib_unregister_client(&nvme_rdma_ib_client);
+err_destroy_wq:
+ destroy_workqueue(nvme_rdma_wq);
+ return ret;
}
static void __exit nvme_rdma_cleanup_module(void)
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 76450b0c55f1d..ff1f97006322b 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -121,7 +121,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
}
switch (req->cmd->get_log_page.lid) {
- case 0x01:
+ case NVME_LOG_ERROR:
/*
* We currently never set the More bit in the status field,
* so all error log entries are invalid and can be zeroed out.
@@ -129,7 +129,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
* mandatory log page.
*/
break;
- case 0x02:
+ case NVME_LOG_SMART:
/*
* XXX: fill out actual smart log
*
@@ -149,7 +149,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req)
goto err;
}
break;
- case 0x03:
+ case NVME_LOG_FW_SLOT:
/*
* We only support a single firmware slot which always is
* active, so we can zero out the whole firmware slot log and
@@ -480,31 +480,25 @@ static void nvmet_execute_keep_alive(struct nvmet_req *req)
nvmet_req_complete(req, 0);
}
-int nvmet_parse_admin_cmd(struct nvmet_req *req)
+u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
+ u16 ret;
req->ns = NULL;
- if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
- pr_err("nvmet: got admin cmd %d while CC.EN == 0\n",
- cmd->common.opcode);
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
- if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n",
- cmd->common.opcode);
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
+ ret = nvmet_check_ctrl_status(req, cmd);
+ if (unlikely(ret))
+ return ret;
switch (cmd->common.opcode) {
case nvme_admin_get_log_page:
req->data_len = nvmet_get_log_page_len(cmd);
switch (cmd->get_log_page.lid) {
- case 0x01:
- case 0x02:
- case 0x03:
+ case NVME_LOG_ERROR:
+ case NVME_LOG_SMART:
+ case NVME_LOG_FW_SLOT:
req->execute = nvmet_execute_get_log_page;
return 0;
}
@@ -545,6 +539,7 @@ int nvmet_parse_admin_cmd(struct nvmet_req *req)
return 0;
}
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+ req->sq->qid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 798653b329b28..cf90713043da0 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -273,8 +273,8 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE,
NULL);
if (IS_ERR(ns->bdev)) {
- pr_err("nvmet: failed to open block device %s: (%ld)\n",
- ns->device_path, PTR_ERR(ns->bdev));
+ pr_err("failed to open block device %s: (%ld)\n",
+ ns->device_path, PTR_ERR(ns->bdev));
ret = PTR_ERR(ns->bdev);
ns->bdev = NULL;
goto out_unlock;
@@ -661,6 +661,23 @@ out:
return status;
}
+/*
+ * Reject a command that arrives while the controller is not enabled
+ * (CC.EN clear) or not ready (CSTS.RDY clear).
+ *
+ * Returns 0 when the controller can accept the command, otherwise
+ * NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR after logging the opcode and queue
+ * id.  The messages say "cmd" rather than "io cmd" because the admin
+ * command parser calls this helper too.
+ *
+ * NOTE(review): req->ns is cleared only on the CSTS.RDY path, not on the
+ * CC.EN path -- confirm whether callers rely on that asymmetry.
+ */
+u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
+{
+	if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
+		pr_err("got cmd %d while CC.EN == 0 on qid = %d\n",
+		       cmd->common.opcode, req->sq->qid);
+		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+	}
+
+	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
+		pr_err("got cmd %d while CSTS.RDY == 0 on qid = %d\n",
+		       cmd->common.opcode, req->sq->qid);
+		req->ns = NULL;
+		return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+	}
+	return 0;
+}
+
static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
const char *hostnqn)
{
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index af8aabf053350..1aaf597e81fc7 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -159,15 +159,15 @@ out:
nvmet_req_complete(req, status);
}
-int nvmet_parse_discovery_cmd(struct nvmet_req *req)
+u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
req->ns = NULL;
if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got cmd %d while not ready\n",
- cmd->common.opcode);
+ pr_err("got cmd %d while not ready\n",
+ cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
@@ -180,8 +180,8 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
req->execute = nvmet_execute_get_disc_log_page;
return 0;
default:
- pr_err("nvmet: unsupported get_log_page lid %d\n",
- cmd->get_log_page.lid);
+ pr_err("unsupported get_log_page lid %d\n",
+ cmd->get_log_page.lid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
case nvme_admin_identify:
@@ -192,17 +192,16 @@ int nvmet_parse_discovery_cmd(struct nvmet_req *req)
nvmet_execute_identify_disc_ctrl;
return 0;
default:
- pr_err("nvmet: unsupported identify cns %d\n",
- cmd->identify.cns);
+ pr_err("unsupported identify cns %d\n",
+ cmd->identify.cns);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
default:
- pr_err("nvmet: unsupported cmd %d\n",
- cmd->common.opcode);
+ pr_err("unsupported cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d\n", cmd->common.opcode);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 8bd022af3df67..3cc17269504bf 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -73,7 +73,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
nvmet_req_complete(req, status);
}
-int nvmet_parse_fabrics_cmd(struct nvmet_req *req)
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
@@ -122,7 +122,15 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
struct nvmet_ctrl *ctrl = NULL;
u16 status = 0;
- d = kmap(sg_page(req->sg)) + req->sg->offset;
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto complete;
+ }
+
+ status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+ if (status)
+ goto out;
/* zero out initial completion result, assign values as needed */
req->rsp->result.u32 = 0;
@@ -143,7 +151,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
}
status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
- le32_to_cpu(c->kato), &ctrl);
+ le32_to_cpu(c->kato), &ctrl);
if (status)
goto out;
@@ -158,7 +166,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid);
out:
- kunmap(sg_page(req->sg));
+ kfree(d);
+complete:
nvmet_req_complete(req, status);
}
@@ -170,7 +179,15 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
u16 qid = le16_to_cpu(c->qid);
u16 status = 0;
- d = kmap(sg_page(req->sg)) + req->sg->offset;
+ d = kmalloc(sizeof(*d), GFP_KERNEL);
+ if (!d) {
+ status = NVME_SC_INTERNAL;
+ goto complete;
+ }
+
+ status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+ if (status)
+ goto out;
/* zero out initial completion result, assign values as needed */
req->rsp->result.u32 = 0;
@@ -183,8 +200,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
}
status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn,
- le16_to_cpu(d->cntlid),
- req, &ctrl);
+ le16_to_cpu(d->cntlid),
+ req, &ctrl);
if (status)
goto out;
@@ -205,7 +222,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
out:
- kunmap(sg_page(req->sg));
+ kfree(d);
+complete:
nvmet_req_complete(req, status);
return;
@@ -214,7 +232,7 @@ out_ctrl_put:
goto out;
}
-int nvmet_parse_connect_cmd(struct nvmet_req *req)
+u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 8f483ee7868c5..074bd3743b5fc 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -82,10 +82,13 @@ struct nvmet_fc_fcp_iod {
enum nvmet_fcp_datadir io_dir;
bool active;
bool abort;
+ bool aborted;
+ bool writedataactive;
spinlock_t flock;
struct nvmet_req req;
struct work_struct work;
+ struct work_struct done_work;
struct nvmet_fc_tgtport *tgtport;
struct nvmet_fc_tgt_queue *queue;
@@ -213,6 +216,7 @@ static DEFINE_IDA(nvmet_fc_tgtport_cnt);
static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work);
static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work);
+static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work);
static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc);
static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
static void nvmet_fc_tgt_q_put(struct nvmet_fc_tgt_queue *queue);
@@ -414,9 +418,13 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
for (i = 0; i < queue->sqsize; fod++, i++) {
INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work);
+ INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work);
fod->tgtport = tgtport;
fod->queue = queue;
fod->active = false;
+ fod->abort = false;
+ fod->aborted = false;
+ fod->fcpreq = NULL;
list_add_tail(&fod->fcp_list, &queue->fod_list);
spin_lock_init(&fod->flock);
@@ -463,7 +471,6 @@ nvmet_fc_alloc_fcp_iod(struct nvmet_fc_tgt_queue *queue)
if (fod) {
list_del(&fod->fcp_list);
fod->active = true;
- fod->abort = false;
/*
* no queue reference is taken, as it was taken by the
* queue lookup just prior to the allocation. The iod
@@ -479,17 +486,30 @@ static void
nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
struct nvmet_fc_fcp_iod *fod)
{
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+ struct nvmet_fc_tgtport *tgtport = fod->tgtport;
unsigned long flags;
+ fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
+ sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+
+ fcpreq->nvmet_fc_private = NULL;
+
spin_lock_irqsave(&queue->qlock, flags);
list_add_tail(&fod->fcp_list, &fod->queue->fod_list);
fod->active = false;
+ fod->abort = false;
+ fod->aborted = false;
+ fod->writedataactive = false;
+ fod->fcpreq = NULL;
spin_unlock_irqrestore(&queue->qlock, flags);
/*
* release the reference taken at queue lookup and fod allocation
*/
nvmet_fc_tgt_q_put(queue);
+
+ tgtport->ops->fcp_req_release(&tgtport->fc_target_port, fcpreq);
}
static int
@@ -616,32 +636,12 @@ nvmet_fc_tgt_q_get(struct nvmet_fc_tgt_queue *queue)
static void
-nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
- struct nvmefc_tgt_fcp_req *fcpreq)
-{
- int ret;
-
- fcpreq->op = NVMET_FCOP_ABORT;
- fcpreq->offset = 0;
- fcpreq->timeout = 0;
- fcpreq->transfer_length = 0;
- fcpreq->transferred_length = 0;
- fcpreq->fcp_error = 0;
- fcpreq->sg_cnt = 0;
-
- ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fcpreq);
- if (ret)
- /* should never reach here !! */
- WARN_ON(1);
-}
-
-
-static void
nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
{
+ struct nvmet_fc_tgtport *tgtport = queue->assoc->tgtport;
struct nvmet_fc_fcp_iod *fod = queue->fod;
unsigned long flags;
- int i;
+ int i, writedataactive;
bool disconnect;
disconnect = atomic_xchg(&queue->connected, 0);
@@ -652,7 +652,20 @@ nvmet_fc_delete_target_queue(struct nvmet_fc_tgt_queue *queue)
if (fod->active) {
spin_lock(&fod->flock);
fod->abort = true;
+ writedataactive = fod->writedataactive;
spin_unlock(&fod->flock);
+ /*
+ * only call lldd abort routine if waiting for
+ * writedata. other outstanding ops should finish
+ * on their own.
+ */
+ if (writedataactive) {
+ spin_lock(&fod->flock);
+ fod->aborted = true;
+ spin_unlock(&fod->flock);
+ tgtport->ops->fcp_abort(
+ &tgtport->fc_target_port, fod->fcpreq);
+ }
}
}
spin_unlock_irqrestore(&queue->qlock, flags);
@@ -846,7 +859,8 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo,
int ret, idx;
if (!template->xmt_ls_rsp || !template->fcp_op ||
- !template->targetport_delete ||
+ !template->fcp_abort ||
+ !template->fcp_req_release || !template->targetport_delete ||
!template->max_hw_queues || !template->max_sgl_segments ||
!template->max_dif_sgl_segments || !template->dma_boundary) {
ret = -EINVAL;
@@ -1189,8 +1203,8 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport,
validation_errors[ret]);
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
- ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1281,8 +1295,9 @@ nvmet_fc_ls_create_connection(struct nvmet_fc_tgtport *tgtport,
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
(ret == VERR_NO_ASSOC) ?
- ELS_RJT_PROT : ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_INV_ASSOC :
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1369,8 +1384,12 @@ nvmet_fc_ls_disconnect(struct nvmet_fc_tgtport *tgtport,
validation_errors[ret]);
iod->lsreq->rsplen = nvmet_fc_format_rjt(acc,
NVME_FC_MAX_LS_BUFFER_SIZE, rqst->w0.ls_cmd,
- (ret == 8) ? ELS_RJT_PROT : ELS_RJT_LOGIC,
- ELS_EXPL_NONE, 0);
+ (ret == VERR_NO_ASSOC) ?
+ FCNVME_RJT_RC_INV_ASSOC :
+ (ret == VERR_NO_CONN) ?
+ FCNVME_RJT_RC_INV_CONN :
+ FCNVME_RJT_RC_LOGIC,
+ FCNVME_RJT_EXP_NONE, 0);
return;
}
@@ -1479,7 +1498,7 @@ nvmet_fc_handle_ls_rqst(struct nvmet_fc_tgtport *tgtport,
default:
iod->lsreq->rsplen = nvmet_fc_format_rjt(iod->rspbuf,
NVME_FC_MAX_LS_BUFFER_SIZE, w0->ls_cmd,
- ELS_RJT_INVAL, ELS_EXPL_NONE, 0);
+ FCNVME_RJT_RC_INVAL, FCNVME_RJT_EXP_NONE, 0);
}
nvmet_fc_xmt_ls_rsp(tgtport, iod);
@@ -1619,6 +1638,8 @@ nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count)
__free_page(sg_page(sg));
kfree(fod->data_sg);
+ fod->data_sg = NULL;
+ fod->data_sg_cnt = 0;
}
@@ -1704,6 +1725,26 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
static void nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq);
static void
+nvmet_fc_abort_op(struct nvmet_fc_tgtport *tgtport,
+ struct nvmet_fc_fcp_iod *fod)
+{
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+
+ /* data no longer needed */
+ nvmet_fc_free_tgt_pgs(fod);
+
+ /*
+ * if an ABTS was received or we issued the fcp_abort early
+ * don't call abort routine again.
+ */
+ /* no need to take lock - lock was taken earlier to get here */
+ if (!fod->aborted)
+ tgtport->ops->fcp_abort(&tgtport->fc_target_port, fcpreq);
+
+ nvmet_fc_free_fcp_iod(fod->queue, fod);
+}
+
+static void
nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
struct nvmet_fc_fcp_iod *fod)
{
@@ -1716,7 +1757,7 @@ nvmet_fc_xmt_fcp_rsp(struct nvmet_fc_tgtport *tgtport,
ret = tgtport->ops->fcp_op(&tgtport->fc_target_port, fod->fcpreq);
if (ret)
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ nvmet_fc_abort_op(tgtport, fod);
}
static void
@@ -1725,6 +1766,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
{
struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
struct scatterlist *sg, *datasg;
+ unsigned long flags;
u32 tlen, sg_off;
int ret;
@@ -1789,10 +1831,13 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
*/
fod->abort = true;
- if (op == NVMET_FCOP_WRITEDATA)
+ if (op == NVMET_FCOP_WRITEDATA) {
+ spin_lock_irqsave(&fod->flock, flags);
+ fod->writedataactive = false;
+ spin_unlock_irqrestore(&fod->flock, flags);
nvmet_req_complete(&fod->req,
NVME_SC_FC_TRANSPORT_ERROR);
- else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
+ } else /* NVMET_FCOP_READDATA or NVMET_FCOP_READDATA_RSP */ {
fcpreq->fcp_error = ret;
fcpreq->transferred_length = 0;
nvmet_fc_xmt_fcp_op_done(fod->fcpreq);
@@ -1800,32 +1845,54 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport,
}
}
+static inline bool
+__nvmet_fc_fod_op_abort(struct nvmet_fc_fcp_iod *fod, bool abort)
+{
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
+ struct nvmet_fc_tgtport *tgtport = fod->tgtport;
+
+ /* if in the middle of an io and we need to tear down */
+ if (abort) {
+ if (fcpreq->op == NVMET_FCOP_WRITEDATA) {
+ nvmet_req_complete(&fod->req,
+ NVME_SC_FC_TRANSPORT_ERROR);
+ return true;
+ }
+
+ nvmet_fc_abort_op(tgtport, fod);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * actual done handler for FCP operations when completed by the lldd
+ */
static void
-nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
{
- struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+ struct nvmefc_tgt_fcp_req *fcpreq = fod->fcpreq;
struct nvmet_fc_tgtport *tgtport = fod->tgtport;
unsigned long flags;
bool abort;
spin_lock_irqsave(&fod->flock, flags);
abort = fod->abort;
+ fod->writedataactive = false;
spin_unlock_irqrestore(&fod->flock, flags);
- /* if in the middle of an io and we need to tear down */
- if (abort && fcpreq->op != NVMET_FCOP_ABORT) {
- /* data no longer needed */
- nvmet_fc_free_tgt_pgs(fod);
-
- nvmet_req_complete(&fod->req, fcpreq->fcp_error);
- return;
- }
-
switch (fcpreq->op) {
case NVMET_FCOP_WRITEDATA:
+ if (__nvmet_fc_fod_op_abort(fod, abort))
+ return;
if (fcpreq->fcp_error ||
fcpreq->transferred_length != fcpreq->transfer_length) {
+ spin_lock(&fod->flock);
+ fod->abort = true;
+ spin_unlock(&fod->flock);
+
nvmet_req_complete(&fod->req,
NVME_SC_FC_TRANSPORT_ERROR);
return;
@@ -1833,6 +1900,10 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
fod->offset += fcpreq->transferred_length;
if (fod->offset != fod->total_length) {
+ spin_lock_irqsave(&fod->flock, flags);
+ fod->writedataactive = true;
+ spin_unlock_irqrestore(&fod->flock, flags);
+
/* transfer the next chunk */
nvmet_fc_transfer_fcp_data(tgtport, fod,
NVMET_FCOP_WRITEDATA);
@@ -1847,12 +1918,11 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
case NVMET_FCOP_READDATA:
case NVMET_FCOP_READDATA_RSP:
+ if (__nvmet_fc_fod_op_abort(fod, abort))
+ return;
if (fcpreq->fcp_error ||
fcpreq->transferred_length != fcpreq->transfer_length) {
- /* data no longer needed */
- nvmet_fc_free_tgt_pgs(fod);
-
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ nvmet_fc_abort_op(tgtport, fod);
return;
}
@@ -1861,8 +1931,6 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
if (fcpreq->op == NVMET_FCOP_READDATA_RSP) {
/* data no longer needed */
nvmet_fc_free_tgt_pgs(fod);
- fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
- sizeof(fod->rspiubuf), DMA_TO_DEVICE);
nvmet_fc_free_fcp_iod(fod->queue, fod);
return;
}
@@ -1885,19 +1953,38 @@ nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
break;
case NVMET_FCOP_RSP:
- case NVMET_FCOP_ABORT:
- fc_dma_sync_single_for_cpu(tgtport->dev, fod->rspdma,
- sizeof(fod->rspiubuf), DMA_TO_DEVICE);
+ if (__nvmet_fc_fod_op_abort(fod, abort))
+ return;
nvmet_fc_free_fcp_iod(fod->queue, fod);
break;
default:
- nvmet_fc_free_tgt_pgs(fod);
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
break;
}
}
+static void
+nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work)
+{
+ struct nvmet_fc_fcp_iod *fod =
+ container_of(work, struct nvmet_fc_fcp_iod, done_work);
+
+ nvmet_fc_fod_op_done(fod);
+}
+
+static void
+nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
+{
+ struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+ struct nvmet_fc_tgt_queue *queue = fod->queue;
+
+ if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR)
+ /* context switch so completion is not in ISR context */
+ queue_work_on(queue->cpu, queue->work_q, &fod->done_work);
+ else
+ nvmet_fc_fod_op_done(fod);
+}
+
/*
* actual completion handler after execution by the nvmet layer
*/
@@ -1919,10 +2006,7 @@ __nvmet_fc_fcp_nvme_cmd_done(struct nvmet_fc_tgtport *tgtport,
fod->queue->sqhd = cqe->sq_head;
if (abort) {
- /* data no longer needed */
- nvmet_fc_free_tgt_pgs(fod);
-
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ nvmet_fc_abort_op(tgtport, fod);
return;
}
@@ -2018,8 +2102,8 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
&fod->queue->nvme_cq,
&fod->queue->nvme_sq,
&nvmet_fc_tgt_fcp_ops);
- if (!ret) { /* bad SQE content */
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ if (!ret) { /* bad SQE content or invalid ctrl state */
+ nvmet_fc_abort_op(tgtport, fod);
return;
}
@@ -2059,7 +2143,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport,
return;
transport_error:
- nvmet_fc_abort_op(tgtport, fod->fcpreq);
+ nvmet_fc_abort_op(tgtport, fod);
}
/*
@@ -2089,7 +2173,7 @@ nvmet_fc_handle_fcp_rqst_work(struct work_struct *work)
* If this routine returns error, the lldd should abort the exchange.
*
* @target_port: pointer to the (registered) target port the FCP CMD IU
- * was receive on.
+ * was received on.
* @fcpreq: pointer to a fcpreq request structure to be used to reference
* the exchange corresponding to the FCP Exchange.
* @cmdiubuf: pointer to the buffer containing the FCP CMD IU
@@ -2112,7 +2196,6 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
(be16_to_cpu(cmdiu->iu_len) != (sizeof(*cmdiu)/4)))
return -EIO;
-
queue = nvmet_fc_find_target_queue(tgtport,
be64_to_cpu(cmdiu->connection_id));
if (!queue)
@@ -2142,12 +2225,68 @@ nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *target_port,
((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0;
memcpy(&fod->cmdiubuf, cmdiubuf, cmdiubuf_len);
- queue_work_on(queue->cpu, queue->work_q, &fod->work);
+ if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR)
+ queue_work_on(queue->cpu, queue->work_q, &fod->work);
+ else
+ nvmet_fc_handle_fcp_rqst(tgtport, fod);
return 0;
}
EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_req);
+/**
+ * nvmet_fc_rcv_fcp_abort - transport entry point called by an LLDD
+ * upon the reception of an ABTS for a FCP command
+ *
+ * Notify the transport that an ABTS has been received for a FCP command
+ * that had been given to the transport via nvmet_fc_rcv_fcp_req(). The
+ * LLDD believes the command is still being worked on
+ * (template_ops->fcp_req_release() has not been called).
+ *
+ * The transport will wait for any outstanding work (an op to the LLDD,
+ * which the lldd should complete with error due to the ABTS; or the
+ * completion from the nvmet layer of the nvme command), then will
+ * stop processing and call the template_ops->fcp_req_release() callback to
+ * return the i/o context to the LLDD. The LLDD may send the BA_ACC
+ * to the ABTS either after return from this function (assuming any
+ * outstanding op work has been terminated) or upon the callback being
+ * called.
+ *
+ * @target_port: pointer to the (registered) target port the FCP CMD IU
+ * was received on.
+ * @fcpreq: pointer to the fcpreq request structure that corresponds
+ * to the exchange that received the ABTS.
+ */
+void
+nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *target_port,
+ struct nvmefc_tgt_fcp_req *fcpreq)
+{
+ struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
+ struct nvmet_fc_tgt_queue *queue;
+ unsigned long flags;
+
+ if (!fod || fod->fcpreq != fcpreq)
+ /* job appears to have already completed, ignore abort */
+ return;
+
+ queue = fod->queue;
+
+ spin_lock_irqsave(&queue->qlock, flags);
+ if (fod->active) {
+ /*
+ * mark as abort. The abort handler, invoked upon completion
+ * of any work, will detect the aborted status and do the
+ * callback.
+ */
+ spin_lock(&fod->flock);
+ fod->abort = true;
+ fod->aborted = true;
+ spin_unlock(&fod->flock);
+ }
+ spin_unlock_irqrestore(&queue->qlock, flags);
+}
+EXPORT_SYMBOL_GPL(nvmet_fc_rcv_fcp_abort);
+
enum {
FCT_TRADDR_ERR = 0,
FCT_TRADDR_WWNN = 1 << 0,
@@ -2177,7 +2316,7 @@ nvmet_fc_parse_traddr(struct nvmet_fc_traddr *traddr, char *buf)
if (!options)
return -ENOMEM;
- while ((p = strsep(&o, ",\n")) != NULL) {
+ while ((p = strsep(&o, ":\n")) != NULL) {
if (!*p)
continue;
@@ -2238,6 +2377,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
if (!tgtport->port) {
tgtport->port = port;
port->priv = tgtport;
+ nvmet_fc_tgtport_get(tgtport);
ret = 0;
} else
ret = -EALREADY;
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 4e8e6a22bce16..aaa3dbe22bd5e 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -246,11 +246,19 @@ struct fcloop_lsreq {
struct fcloop_fcpreq {
struct fcloop_tport *tport;
struct nvmefc_fcp_req *fcpreq;
+ spinlock_t reqlock;
u16 status;
+ bool active;
+ bool aborted;
struct work_struct work;
struct nvmefc_tgt_fcp_req tgt_fcp_req;
};
+struct fcloop_ini_fcpreq {
+ struct nvmefc_fcp_req *fcpreq;
+ struct fcloop_fcpreq *tfcp_req;
+ struct work_struct iniwork;
+};
static inline struct fcloop_lsreq *
tgt_ls_req_to_lsreq(struct nvmefc_tgt_ls_req *tgt_lsreq)
@@ -341,7 +349,21 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
}
/*
- * FCP IO operation done. call back up initiator "done" flows.
+ * FCP IO operation done by initiator abort.
+ * call back up initiator "done" flows.
+ */
+static void
+fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+{
+ struct fcloop_ini_fcpreq *inireq =
+ container_of(work, struct fcloop_ini_fcpreq, iniwork);
+
+ inireq->fcpreq->done(inireq->fcpreq);
+}
+
+/*
+ * FCP IO operation done by target completion.
+ * call back up initiator "done" flows.
*/
static void
fcloop_tgt_fcprqst_done_work(struct work_struct *work)
@@ -349,12 +371,18 @@ fcloop_tgt_fcprqst_done_work(struct work_struct *work)
struct fcloop_fcpreq *tfcp_req =
container_of(work, struct fcloop_fcpreq, work);
struct fcloop_tport *tport = tfcp_req->tport;
- struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+ struct nvmefc_fcp_req *fcpreq;
- if (tport->remoteport) {
+ spin_lock(&tfcp_req->reqlock);
+ fcpreq = tfcp_req->fcpreq;
+ spin_unlock(&tfcp_req->reqlock);
+
+ if (tport->remoteport && fcpreq) {
fcpreq->status = tfcp_req->status;
fcpreq->done(fcpreq);
}
+
+ kfree(tfcp_req);
}
@@ -364,20 +392,25 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
void *hw_queue_handle,
struct nvmefc_fcp_req *fcpreq)
{
- struct fcloop_fcpreq *tfcp_req = fcpreq->private;
struct fcloop_rport *rport = remoteport->private;
+ struct fcloop_ini_fcpreq *inireq = fcpreq->private;
+ struct fcloop_fcpreq *tfcp_req;
int ret = 0;
- INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+ if (!rport->targetport)
+ return -ECONNREFUSED;
- if (!rport->targetport) {
- tfcp_req->status = NVME_SC_FC_TRANSPORT_ERROR;
- schedule_work(&tfcp_req->work);
- return ret;
- }
+ tfcp_req = kzalloc(sizeof(*tfcp_req), GFP_KERNEL);
+ if (!tfcp_req)
+ return -ENOMEM;
+ inireq->fcpreq = fcpreq;
+ inireq->tfcp_req = tfcp_req;
+ INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
tfcp_req->fcpreq = fcpreq;
tfcp_req->tport = rport->targetport->private;
+ spin_lock_init(&tfcp_req->reqlock);
+ INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
fcpreq->cmdaddr, fcpreq->cmdlen);
@@ -444,63 +477,129 @@ fcloop_fcp_op(struct nvmet_fc_target_port *tgtport,
struct nvmefc_tgt_fcp_req *tgt_fcpreq)
{
struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
- struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+ struct nvmefc_fcp_req *fcpreq;
u32 rsplen = 0, xfrlen = 0;
- int fcp_err = 0;
+ int fcp_err = 0, active, aborted;
u8 op = tgt_fcpreq->op;
+ spin_lock(&tfcp_req->reqlock);
+ fcpreq = tfcp_req->fcpreq;
+ active = tfcp_req->active;
+ aborted = tfcp_req->aborted;
+ tfcp_req->active = true;
+ spin_unlock(&tfcp_req->reqlock);
+
+ if (unlikely(active))
+ /* illegal - call while i/o active */
+ return -EALREADY;
+
+ if (unlikely(aborted)) {
+ /* target transport has aborted i/o prior */
+ spin_lock(&tfcp_req->reqlock);
+ tfcp_req->active = false;
+ spin_unlock(&tfcp_req->reqlock);
+ tgt_fcpreq->transferred_length = 0;
+ tgt_fcpreq->fcp_error = -ECANCELED;
+ tgt_fcpreq->done(tgt_fcpreq);
+ return 0;
+ }
+
+ /*
+ * if fcpreq is NULL, the I/O has been aborted (from
+ * initiator side). For the target side, act as if all is well
+ * but don't actually move data.
+ */
+
switch (op) {
case NVMET_FCOP_WRITEDATA:
xfrlen = tgt_fcpreq->transfer_length;
- fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
- tgt_fcpreq->offset, xfrlen);
- fcpreq->transferred_length += xfrlen;
+ if (fcpreq) {
+ fcloop_fcp_copy_data(op, tgt_fcpreq->sg,
+ fcpreq->first_sgl, tgt_fcpreq->offset,
+ xfrlen);
+ fcpreq->transferred_length += xfrlen;
+ }
break;
case NVMET_FCOP_READDATA:
case NVMET_FCOP_READDATA_RSP:
xfrlen = tgt_fcpreq->transfer_length;
- fcloop_fcp_copy_data(op, tgt_fcpreq->sg, fcpreq->first_sgl,
- tgt_fcpreq->offset, xfrlen);
- fcpreq->transferred_length += xfrlen;
+ if (fcpreq) {
+ fcloop_fcp_copy_data(op, tgt_fcpreq->sg,
+ fcpreq->first_sgl, tgt_fcpreq->offset,
+ xfrlen);
+ fcpreq->transferred_length += xfrlen;
+ }
if (op == NVMET_FCOP_READDATA)
break;
/* Fall-Thru to RSP handling */
case NVMET_FCOP_RSP:
- rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
- fcpreq->rsplen : tgt_fcpreq->rsplen);
- memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
- if (rsplen < tgt_fcpreq->rsplen)
- fcp_err = -E2BIG;
- fcpreq->rcv_rsplen = rsplen;
- fcpreq->status = 0;
+ if (fcpreq) {
+ rsplen = ((fcpreq->rsplen < tgt_fcpreq->rsplen) ?
+ fcpreq->rsplen : tgt_fcpreq->rsplen);
+ memcpy(fcpreq->rspaddr, tgt_fcpreq->rspaddr, rsplen);
+ if (rsplen < tgt_fcpreq->rsplen)
+ fcp_err = -E2BIG;
+ fcpreq->rcv_rsplen = rsplen;
+ fcpreq->status = 0;
+ }
tfcp_req->status = 0;
break;
- case NVMET_FCOP_ABORT:
- tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
- break;
-
default:
fcp_err = -EINVAL;
break;
}
+ spin_lock(&tfcp_req->reqlock);
+ tfcp_req->active = false;
+ spin_unlock(&tfcp_req->reqlock);
+
tgt_fcpreq->transferred_length = xfrlen;
tgt_fcpreq->fcp_error = fcp_err;
tgt_fcpreq->done(tgt_fcpreq);
- if ((!fcp_err) && (op == NVMET_FCOP_RSP ||
- op == NVMET_FCOP_READDATA_RSP ||
- op == NVMET_FCOP_ABORT))
- schedule_work(&tfcp_req->work);
-
return 0;
}
static void
+fcloop_tgt_fcp_abort(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+ struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+ int active;
+
+ /*
+ * mark aborted only in case there were 2 threads in transport
+ * (one doing io, other doing abort) and only kills ops posted
+ * after the abort request
+ */
+ spin_lock(&tfcp_req->reqlock);
+ active = tfcp_req->active;
+ tfcp_req->aborted = true;
+ spin_unlock(&tfcp_req->reqlock);
+
+ tfcp_req->status = NVME_SC_FC_TRANSPORT_ABORTED;
+
+ /*
+ * nothing more to do. If io wasn't active, the transport should
+ * immediately call the req_release. If it was active, the op
+ * will complete, and the lldd should call req_release.
+ */
+}
+
+static void
+fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *tgt_fcpreq)
+{
+ struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
+
+ schedule_work(&tfcp_req->work);
+}
+
+static void
fcloop_ls_abort(struct nvme_fc_local_port *localport,
struct nvme_fc_remote_port *remoteport,
struct nvmefc_ls_req *lsreq)
@@ -513,6 +612,27 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
void *hw_queue_handle,
struct nvmefc_fcp_req *fcpreq)
{
+ struct fcloop_rport *rport = remoteport->private;
+ struct fcloop_ini_fcpreq *inireq = fcpreq->private;
+ struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+
+ if (!tfcp_req)
+ /* abort has already been called */
+ return;
+
+ if (rport->targetport)
+ nvmet_fc_rcv_fcp_abort(rport->targetport,
+ &tfcp_req->tgt_fcp_req);
+
+ /* break initiator/target relationship for io */
+ spin_lock(&tfcp_req->reqlock);
+ inireq->tfcp_req = NULL;
+ tfcp_req->fcpreq = NULL;
+ spin_unlock(&tfcp_req->reqlock);
+
+ /* post the aborted io completion */
+ fcpreq->status = -ECANCELED;
+ schedule_work(&inireq->iniwork);
}
static void
@@ -563,20 +683,23 @@ struct nvme_fc_port_template fctemplate = {
.local_priv_sz = sizeof(struct fcloop_lport),
.remote_priv_sz = sizeof(struct fcloop_rport),
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
- .fcprqst_priv_sz = sizeof(struct fcloop_fcpreq),
+ .fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
};
struct nvmet_fc_target_template tgttemplate = {
.targetport_delete = fcloop_targetport_delete,
.xmt_ls_rsp = fcloop_xmt_ls_rsp,
.fcp_op = fcloop_fcp_op,
+ .fcp_abort = fcloop_tgt_fcp_abort,
+ .fcp_req_release = fcloop_fcp_req_release,
.max_hw_queues = FCLOOP_HW_QUEUES,
.max_sgl_segments = FCLOOP_SGL_SEGS,
.max_dif_sgl_segments = FCLOOP_SGL_SEGS,
.dma_boundary = FCLOOP_DMABOUND_4G,
/* optional features */
- .target_features = NVMET_FCTGTFEAT_READDATA_RSP |
- NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED,
+ .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR |
+ NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED |
+ NVMET_FCTGTFEAT_OPDONE_IN_ISR,
/* sizes of additional private data for data structures */
.target_priv_sz = sizeof(struct fcloop_tport),
};
diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 6b0baa9caab9f..c77940d80fc8e 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -184,7 +184,7 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
(req->ns->blksize_shift - 9)) + 1;
if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
- GFP_KERNEL, &bio, true))
+ GFP_KERNEL, &bio, 0))
status = NVME_SC_INTERNAL | NVME_SC_DNR;
if (bio) {
@@ -196,26 +196,19 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)
}
}
-int nvmet_parse_io_cmd(struct nvmet_req *req)
+u16 nvmet_parse_io_cmd(struct nvmet_req *req)
{
struct nvme_command *cmd = req->cmd;
+ u16 ret;
- if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) {
- pr_err("nvmet: got io cmd %d while CC.EN == 0\n",
- cmd->common.opcode);
+ ret = nvmet_check_ctrl_status(req, cmd);
+ if (unlikely(ret)) {
req->ns = NULL;
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
- }
-
- if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
- pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n",
- cmd->common.opcode);
- req->ns = NULL;
- return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
+ return ret;
}
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
- if (!req->ns)
+ if (unlikely(!req->ns))
return NVME_SC_INVALID_NS | NVME_SC_DNR;
switch (cmd->common.opcode) {
@@ -237,7 +230,8 @@ int nvmet_parse_io_cmd(struct nvmet_req *req)
req->execute = nvmet_execute_write_zeroes;
return 0;
default:
- pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode);
+ pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
+ req->sq->qid);
return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
}
}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index c7b0b6a527083..304f1c87c160c 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -13,12 +13,10 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/scatterlist.h>
-#include <linux/delay.h>
#include <linux/blk-mq.h>
#include <linux/nvme.h>
#include <linux/module.h>
#include <linux/parser.h>
-#include <linux/t10-pi.h>
#include "nvmet.h"
#include "../host/nvme.h"
#include "../host/fabrics.h"
@@ -93,31 +91,26 @@ static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue)
static void nvme_loop_complete_rq(struct request *req)
{
struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
- int error = 0;
nvme_cleanup_cmd(req);
sg_free_table_chained(&iod->sg_table, true);
+ nvme_complete_rq(req);
+}
- if (unlikely(req->errors)) {
- if (nvme_req_needs_retry(req, req->errors)) {
- nvme_requeue_req(req);
- return;
- }
-
- if (blk_rq_is_passthrough(req))
- error = req->errors;
- else
- error = nvme_error_status(req->errors);
- }
+static struct blk_mq_tags *nvme_loop_tagset(struct nvme_loop_queue *queue)
+{
+ u32 queue_idx = nvme_loop_queue_idx(queue);
- blk_mq_end_request(req, error);
+ if (queue_idx == 0)
+ return queue->ctrl->admin_tag_set.tags[queue_idx];
+ return queue->ctrl->tag_set.tags[queue_idx - 1];
}
static void nvme_loop_queue_response(struct nvmet_req *req)
{
- struct nvme_loop_iod *iod =
- container_of(req, struct nvme_loop_iod, req);
- struct nvme_completion *cqe = &iod->rsp;
+ struct nvme_loop_queue *queue =
+ container_of(req->sq, struct nvme_loop_queue, nvme_sq);
+ struct nvme_completion *cqe = req->rsp;
/*
* AEN requests are special as they don't time out and can
@@ -125,15 +118,22 @@ static void nvme_loop_queue_response(struct nvmet_req *req)
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 &&
+ if (unlikely(nvme_loop_queue_idx(queue) == 0 &&
cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) {
- nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe->status,
+ nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
} else {
- struct request *rq = blk_mq_rq_from_pdu(iod);
+ struct request *rq;
+
+ rq = blk_mq_tag_to_rq(nvme_loop_tagset(queue), cqe->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "tag 0x%x on queue %d not found\n",
+ cqe->command_id, nvme_loop_queue_idx(queue));
+ return;
+ }
- iod->nvme_req.result = cqe->result;
- blk_mq_complete_request(rq, le16_to_cpu(cqe->status) >> 1);
+ nvme_end_request(rq, cqe->status, cqe->result);
}
}
@@ -154,7 +154,7 @@ nvme_loop_timeout(struct request *rq, bool reserved)
schedule_work(&iod->queue->ctrl->reset_work);
/* fail with DNR on admin cmd timeout */
- rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+ nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
return BLK_EH_HANDLED;
}
@@ -268,7 +268,7 @@ static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
return 0;
}
-static struct blk_mq_ops nvme_loop_mq_ops = {
+static const struct blk_mq_ops nvme_loop_mq_ops = {
.queue_rq = nvme_loop_queue_rq,
.complete = nvme_loop_complete_rq,
.init_request = nvme_loop_init_request,
@@ -276,7 +276,7 @@ static struct blk_mq_ops nvme_loop_mq_ops = {
.timeout = nvme_loop_timeout,
};
-static struct blk_mq_ops nvme_loop_admin_mq_ops = {
+static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
.queue_rq = nvme_loop_queue_rq,
.complete = nvme_loop_complete_rq,
.init_request = nvme_loop_init_admin_request,
@@ -349,6 +349,19 @@ out_destroy_queues:
return ret;
}
+static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl)
+{
+ int i, ret;
+
+ for (i = 1; i < ctrl->queue_count; i++) {
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
{
int error;
@@ -490,7 +503,7 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
struct nvme_loop_ctrl *ctrl = container_of(work,
struct nvme_loop_ctrl, reset_work);
bool changed;
- int i, ret;
+ int ret;
nvme_loop_shutdown_ctrl(ctrl);
@@ -502,11 +515,9 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
if (ret)
goto out_destroy_admin;
- for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
- if (ret)
- goto out_destroy_io;
- }
+ ret = nvme_loop_connect_io_queues(ctrl);
+ if (ret)
+ goto out_destroy_io;
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
@@ -559,7 +570,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = {
static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
{
- int ret, i;
+ int ret;
ret = nvme_loop_init_io_queues(ctrl);
if (ret)
@@ -588,11 +599,9 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
goto out_free_tagset;
}
- for (i = 1; i < ctrl->queue_count; i++) {
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
- if (ret)
- goto out_cleanup_connect_q;
- }
+ ret = nvme_loop_connect_io_queues(ctrl);
+ if (ret)
+ goto out_cleanup_connect_q;
return 0;
@@ -736,7 +745,12 @@ static int __init nvme_loop_init_module(void)
ret = nvmet_register_transport(&nvme_loop_ops);
if (ret)
return ret;
- return nvmf_register_transport(&nvme_loop_transport);
+
+ ret = nvmf_register_transport(&nvme_loop_transport);
+ if (ret)
+ nvmet_unregister_transport(&nvme_loop_ops);
+
+ return ret;
}
static void __exit nvme_loop_cleanup_module(void)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index f7ff15f17ca97..7cb77ba5993b9 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -253,11 +253,11 @@ struct nvmet_async_event {
u8 log_page;
};
-int nvmet_parse_connect_cmd(struct nvmet_req *req);
-int nvmet_parse_io_cmd(struct nvmet_req *req);
-int nvmet_parse_admin_cmd(struct nvmet_req *req);
-int nvmet_parse_discovery_cmd(struct nvmet_req *req);
-int nvmet_parse_fabrics_cmd(struct nvmet_req *req);
+u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
+u16 nvmet_parse_io_cmd(struct nvmet_req *req);
+u16 nvmet_parse_admin_cmd(struct nvmet_req *req);
+u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
+u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req);
bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops);
@@ -278,6 +278,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid,
struct nvmet_req *req, struct nvmet_ctrl **ret);
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl);
+u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd);
struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
enum nvme_subsys_type type);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index ecc4fe8625612..99c69018a35f4 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1199,6 +1199,11 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
}
queue->port = cm_id->context;
+ if (queue->host_qid == 0) {
+ /* Let inflight controller teardown complete */
+ flush_scheduled_work();
+ }
+
ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
if (ret)
goto release_queue;
@@ -1427,12 +1432,16 @@ restart:
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
struct rdma_cm_id *cm_id;
- struct sockaddr_in addr_in;
- u16 port_in;
+ struct sockaddr_storage addr = { };
+ __kernel_sa_family_t af;
int ret;
switch (port->disc_addr.adrfam) {
case NVMF_ADDR_FAMILY_IP4:
+ af = AF_INET;
+ break;
+ case NVMF_ADDR_FAMILY_IP6:
+ af = AF_INET6;
break;
default:
pr_err("address family %d not supported\n",
@@ -1440,13 +1449,13 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
return -EINVAL;
}
- ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in);
- if (ret)
+ ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
+ port->disc_addr.trsvcid, &addr);
+ if (ret) {
+ pr_err("malformed ip/port passed: %s:%s\n",
+ port->disc_addr.traddr, port->disc_addr.trsvcid);
return ret;
-
- addr_in.sin_family = AF_INET;
- addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr);
- addr_in.sin_port = htons(port_in);
+ }
cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
RDMA_PS_TCP, IB_QPT_RC);
@@ -1455,20 +1464,32 @@ static int nvmet_rdma_add_port(struct nvmet_port *port)
return PTR_ERR(cm_id);
}
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in);
+ /*
+ * Allow both IPv4 and IPv6 sockets to bind a single port
+ * at the same time.
+ */
+ ret = rdma_set_afonly(cm_id, 1);
+ if (ret) {
+ pr_err("rdma_set_afonly failed (%d)\n", ret);
+ goto out_destroy_id;
+ }
+
+ ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
if (ret) {
- pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret);
+ pr_err("binding CM ID to %pISpcs failed (%d)\n",
+ (struct sockaddr *)&addr, ret);
goto out_destroy_id;
}
ret = rdma_listen(cm_id, 128);
if (ret) {
- pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret);
+ pr_err("listening to %pISpcs failed (%d)\n",
+ (struct sockaddr *)&addr, ret);
goto out_destroy_id;
}
- pr_info("enabling port %d (%pISpc)\n",
- le16_to_cpu(port->disc_addr.portid), &addr_in);
+ pr_info("enabling port %d (%pISpcs)\n",
+ le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
port->priv = cm_id;
return 0;
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 6ff61dad5e215..62fed9dc893ef 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -183,11 +183,33 @@ static void jsfd_read(char *buf, unsigned long p, size_t togo) {
}
}
-static void jsfd_do_request(struct request_queue *q)
+/* Index into jsfd_disk[] of the queue that jsfd_next_request() polls next. */
+static int jsfd_queue;
+
+/*
+ * Fetch the next pending request, scanning the per-disk queues round-robin.
+ *
+ * The scan starts at jsfd_queue and moves the cursor (wrapping at JSF_MAX)
+ * past every queue it inspects, so the following call resumes after the
+ * queue that was just served.  Returns NULL once the cursor is back where
+ * it started without blk_fetch_request() yielding a request.
+ */
+static struct request *jsfd_next_request(void)
+{
+	const int start = jsfd_queue;
+
+	for (;;) {
+		struct request_queue *queue = jsfd_disk[jsfd_queue]->queue;
+		struct request *req = NULL;
+
+		jsfd_queue++;
+		if (jsfd_queue == JSF_MAX)
+			jsfd_queue = 0;
+
+		if (queue)
+			req = blk_fetch_request(queue);
+		if (req)
+			return req;
+
+		if (jsfd_queue == start)
+			return NULL;
+	}
+}
+
+static void jsfd_request(void)
{
struct request *req;
- req = blk_fetch_request(q);
+ req = jsfd_next_request();
while (req) {
struct jsfd_part *jdp = req->rq_disk->private_data;
unsigned long offset = blk_rq_pos(req) << 9;
@@ -211,10 +233,15 @@ static void jsfd_do_request(struct request_queue *q)
err = 0;
end:
if (!__blk_end_request_cur(req, err))
- req = blk_fetch_request(q);
+ req = jsfd_next_request();
}
}
+/*
+ * Request handler registered with blk_init_queue() for every jsfd disk.
+ * The queue argument is not used: jsfd_request() chooses which queues to
+ * service on its own.
+ */
+static void jsfd_do_request(struct request_queue *q)
+{
+ jsfd_request();
+}
+
/*
* The memory devices use the full 32/64 bits of the offset, and so we cannot
* check against negative addresses: they are ok. The return value is weird,
@@ -544,8 +571,6 @@ static int jsflash_init(void)
return 0;
}
-static struct request_queue *jsf_queue;
-
static int jsfd_init(void)
{
static DEFINE_SPINLOCK(lock);
@@ -562,6 +587,11 @@ static int jsfd_init(void)
struct gendisk *disk = alloc_disk(1);
if (!disk)
goto out;
+ disk->queue = blk_init_queue(jsfd_do_request, &lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ goto out;
+ }
jsfd_disk[i] = disk;
}
@@ -570,13 +600,6 @@ static int jsfd_init(void)
goto out;
}
- jsf_queue = blk_init_queue(jsfd_do_request, &lock);
- if (!jsf_queue) {
- err = -ENOMEM;
- unregister_blkdev(JSFD_MAJOR, "jsfd");
- goto out;
- }
-
for (i = 0; i < JSF_MAX; i++) {
struct gendisk *disk = jsfd_disk[i];
if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
@@ -589,7 +612,6 @@ static int jsfd_init(void)
disk->fops = &jsfd_fops;
set_capacity(disk, jdp->dsize >> 9);
disk->private_data = jdp;
- disk->queue = jsf_queue;
add_disk(disk);
set_disk_ro(disk, 1);
}
@@ -619,6 +641,7 @@ static void __exit jsflash_cleanup_module(void)
for (i = 0; i < JSF_MAX; i++) {
if ((i & JSF_PART_MASK) >= JSF_NPART) continue;
del_gendisk(jsfd_disk[i]);
+ blk_cleanup_queue(jsfd_disk[i]->queue);
put_disk(jsfd_disk[i]);
}
if (jsf0.busy)
@@ -628,7 +651,6 @@ static void __exit jsflash_cleanup_module(void)
misc_deregister(&jsf_dev);
unregister_blkdev(JSFD_MAJOR, "jsfd");
- blk_cleanup_queue(jsf_queue);
}
module_init(jsflash_init_module);
diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile
index fc2855565a51f..93dbe58c47c84 100644
--- a/drivers/scsi/Makefile
+++ b/drivers/scsi/Makefile
@@ -166,6 +166,7 @@ scsi_mod-y += scsi_scan.o scsi_sysfs.o scsi_devinfo.o
scsi_mod-$(CONFIG_SCSI_NETLINK) += scsi_netlink.o
scsi_mod-$(CONFIG_SYSCTL) += scsi_sysctl.o
scsi_mod-$(CONFIG_SCSI_PROC_FS) += scsi_proc.o
+scsi_mod-$(CONFIG_BLK_DEBUG_FS) += scsi_debugfs.o
scsi_mod-y += scsi_trace.o scsi_logging.o
scsi_mod-$(CONFIG_PM) += scsi_pm.o
scsi_mod-$(CONFIG_SCSI_DH) += scsi_dh.o
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c
index acba1b67e505e..b2333b3889c72 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -408,9 +408,7 @@ out:
if (phba->ktime_on)
lpfc_nvmet_ktime(phba, ctxp);
#endif
- /* Let Abort cmpl repost the context */
- if (!(ctxp->flag & LPFC_NVMET_ABORT_OP))
- lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+ /* lpfc_nvmet_xmt_fcp_release() will recycle the context */
} else {
ctxp->entry_cnt++;
start_clean = offsetof(struct lpfc_iocbq, wqe);
@@ -544,27 +542,6 @@ lpfc_nvmet_xmt_fcp_op(struct nvmet_fc_target_port *tgtport,
}
#endif
- if (rsp->op == NVMET_FCOP_ABORT) {
- lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
- "6103 Abort op: oxri x%x %d cnt %d\n",
- ctxp->oxid, ctxp->state, ctxp->entry_cnt);
-
- lpfc_nvmeio_data(phba, "NVMET FCP ABRT: "
- "xri x%x state x%x cnt x%x\n",
- ctxp->oxid, ctxp->state, ctxp->entry_cnt);
-
- atomic_inc(&lpfc_nvmep->xmt_fcp_abort);
- ctxp->entry_cnt++;
- ctxp->flag |= LPFC_NVMET_ABORT_OP;
- if (ctxp->flag & LPFC_NVMET_IO_INP)
- lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid,
- ctxp->oxid);
- else
- lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid,
- ctxp->oxid);
- return 0;
- }
-
/* Sanity check */
if (ctxp->state == LPFC_NVMET_STE_ABORT) {
atomic_inc(&lpfc_nvmep->xmt_fcp_drop);
@@ -634,10 +611,75 @@ lpfc_nvmet_targetport_delete(struct nvmet_fc_target_port *targetport)
complete(&tport->tport_unreg_done);
}
+/*
+ * lpfc_nvmet_xmt_fcp_abort - transport fcp_abort entry point
+ * @tgtport: target port the I/O belongs to
+ * @req: transport request embedded in the driver's receive context
+ *
+ * Logs the abort, bumps the xmt_fcp_abort counter, marks the context with
+ * LPFC_NVMET_ABORT_OP and issues the solicited or unsolicited abort
+ * depending on whether LPFC_NVMET_IO_INP is set.
+ *
+ * ctxp->flag is updated under ctxp->ctxlock, as the release and
+ * abort-completion paths do, so this read-modify-write cannot lose a
+ * concurrent LPFC_NVMET_CTX_RLS / LPFC_NVMET_ABORT_OP change made there.
+ * The lock is dropped again before the abort is issued.
+ */
+static void
+lpfc_nvmet_xmt_fcp_abort(struct nvmet_fc_target_port *tgtport,
+			 struct nvmefc_tgt_fcp_req *req)
+{
+	struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private;
+	struct lpfc_nvmet_rcv_ctx *ctxp =
+		container_of(req, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);
+	struct lpfc_hba *phba = ctxp->phba;
+	unsigned long flags;
+	bool io_inp;
+
+	lpfc_printf_log(phba, KERN_INFO, LOG_NVME_ABTS,
+			"6103 Abort op: oxri x%x %d cnt %d\n",
+			ctxp->oxid, ctxp->state, ctxp->entry_cnt);
+
+	lpfc_nvmeio_data(phba, "NVMET FCP ABRT: xri x%x state x%x cnt x%x\n",
+			 ctxp->oxid, ctxp->state, ctxp->entry_cnt);
+
+	atomic_inc(&lpfc_nvmep->xmt_fcp_abort);
+	ctxp->entry_cnt++;
+
+	/* Snapshot IO_INP in the same critical section that sets ABORT_OP. */
+	spin_lock_irqsave(&ctxp->ctxlock, flags);
+	ctxp->flag |= LPFC_NVMET_ABORT_OP;
+	io_inp = ctxp->flag & LPFC_NVMET_IO_INP;
+	spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
+	if (io_inp)
+		lpfc_nvmet_sol_fcp_issue_abort(phba, ctxp, ctxp->sid,
+					       ctxp->oxid);
+	else
+		lpfc_nvmet_unsol_fcp_issue_abort(phba, ctxp, ctxp->sid,
+						 ctxp->oxid);
+}
+
+/*
+ * lpfc_nvmet_xmt_fcp_release - transport fcp_req_release entry point
+ * @tgtport: target port the I/O belongs to
+ * @rsp: transport request embedded in the driver's receive context
+ *
+ * If an abort is outstanding (LPFC_NVMET_ABORT_OP), only record the release
+ * by setting LPFC_NVMET_CTX_RLS under ctxlock and return; the context is
+ * then left for the abort path.  Otherwise log a complaint when the context
+ * is not in LPFC_NVMET_STE_DONE and hand the receive buffer to
+ * lpfc_nvmet_rq_post().
+ */
+static void
+lpfc_nvmet_xmt_fcp_release(struct nvmet_fc_target_port *tgtport,
+			   struct nvmefc_tgt_fcp_req *rsp)
+{
+	struct lpfc_nvmet_rcv_ctx *ctxp =
+		container_of(rsp, struct lpfc_nvmet_rcv_ctx, ctx.fcp_req);
+	struct lpfc_nvmet_tgtport *lpfc_nvmep = tgtport->private;
+	struct lpfc_hba *phba = ctxp->phba;
+	unsigned long iflags;
+	bool abort_pending;
+
+	spin_lock_irqsave(&ctxp->ctxlock, iflags);
+	abort_pending = !!(ctxp->flag & LPFC_NVMET_ABORT_OP);
+	if (abort_pending)
+		ctxp->flag |= LPFC_NVMET_CTX_RLS;
+	spin_unlock_irqrestore(&ctxp->ctxlock, iflags);
+
+	/* let the abort path do the real release */
+	if (abort_pending)
+		return;
+
+	/* Sanity check */
+	if (ctxp->state != LPFC_NVMET_STE_DONE) {
+		atomic_inc(&lpfc_nvmep->xmt_fcp_drop);
+		lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
+				"6117 Bad state IO x%x aborted\n",
+				ctxp->oxid);
+	}
+
+	lpfc_nvmeio_data(phba, "NVMET FCP FREE: xri x%x ste %d\n", ctxp->oxid,
+			 ctxp->state, 0);
+
+	lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+}
+
static struct nvmet_fc_target_template lpfc_tgttemplate = {
.targetport_delete = lpfc_nvmet_targetport_delete,
.xmt_ls_rsp = lpfc_nvmet_xmt_ls_rsp,
.fcp_op = lpfc_nvmet_xmt_fcp_op,
+ .fcp_abort = lpfc_nvmet_xmt_fcp_abort,
+ .fcp_req_release = lpfc_nvmet_xmt_fcp_release,
.max_hw_queues = 1,
.max_sgl_segments = LPFC_NVMET_DEFAULT_SEGS,
@@ -669,7 +711,9 @@ lpfc_nvmet_create_targetport(struct lpfc_hba *phba)
lpfc_tgttemplate.max_hw_queues = phba->cfg_nvme_io_channel;
lpfc_tgttemplate.max_sgl_segments = phba->cfg_sg_seg_cnt;
lpfc_tgttemplate.target_features = NVMET_FCTGTFEAT_READDATA_RSP |
- NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED;
+ NVMET_FCTGTFEAT_NEEDS_CMD_CPUSCHED |
+ NVMET_FCTGTFEAT_CMD_IN_ISR |
+ NVMET_FCTGTFEAT_OPDONE_IN_ISR;
#if (IS_ENABLED(CONFIG_NVME_TARGET_FC))
error = nvmet_fc_register_targetport(&pinfo, &lpfc_tgttemplate,
@@ -832,6 +876,7 @@ dropit:
ctxp->wqeq = NULL;
ctxp->state = LPFC_NVMET_STE_RCV;
ctxp->rqb_buffer = (void *)nvmebuf;
+ spin_lock_init(&ctxp->ctxlock);
lpfc_nvmeio_data(phba, "NVMET LS RCV: xri x%x sz %d from %06x\n",
oxid, size, sid);
@@ -1593,6 +1638,8 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
struct lpfc_nvmet_rcv_ctx *ctxp;
struct lpfc_nvmet_tgtport *tgtp;
uint32_t status, result;
+ unsigned long flags;
+ bool released = false;
ctxp = cmdwqe->context2;
status = bf_get(lpfc_wcqe_c_status, wcqe);
@@ -1607,7 +1654,18 @@ lpfc_nvmet_sol_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
result, wcqe->word3);
ctxp->state = LPFC_NVMET_STE_DONE;
- lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+ spin_lock_irqsave(&ctxp->ctxlock, flags);
+ if (ctxp->flag & LPFC_NVMET_CTX_RLS)
+ released = true;
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
+ spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
+ /*
+ * if transport has released ctx, then can reuse it. Otherwise,
+ * will be recycled by transport release call.
+ */
+ if (released)
+ lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
cmdwqe->context2 = NULL;
cmdwqe->context3 = NULL;
@@ -1630,7 +1688,9 @@ lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
{
struct lpfc_nvmet_rcv_ctx *ctxp;
struct lpfc_nvmet_tgtport *tgtp;
+ unsigned long flags;
uint32_t status, result;
+ bool released = false;
ctxp = cmdwqe->context2;
status = bf_get(lpfc_wcqe_c_status, wcqe);
@@ -1652,7 +1712,19 @@ lpfc_nvmet_xmt_fcp_abort_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
ctxp->state, ctxp->oxid);
}
ctxp->state = LPFC_NVMET_STE_DONE;
- lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+ spin_lock_irqsave(&ctxp->ctxlock, flags);
+ if (ctxp->flag & LPFC_NVMET_CTX_RLS)
+ released = true;
+ ctxp->flag &= ~LPFC_NVMET_ABORT_OP;
+ spin_unlock_irqrestore(&ctxp->ctxlock, flags);
+
+ /*
+ * if transport has released ctx, then can reuse it. Otherwise,
+ * will be recycled by transport release call.
+ */
+ if (released)
+ lpfc_nvmet_rq_post(phba, ctxp, &ctxp->rqb_buffer->hbuf);
+
cmdwqe->context2 = NULL;
cmdwqe->context3 = NULL;
}
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.h b/drivers/scsi/lpfc/lpfc_nvmet.h
index ca96f05c1604f..02735fc6fd411 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.h
+++ b/drivers/scsi/lpfc/lpfc_nvmet.h
@@ -81,6 +81,7 @@ struct lpfc_nvmet_rcv_ctx {
struct lpfc_iocbq *wqeq;
struct lpfc_iocbq *abort_wqeq;
dma_addr_t txrdy_phys;
+ spinlock_t ctxlock; /* protect flag access */
uint32_t *txrdy;
uint32_t sid;
uint32_t offset;
@@ -97,8 +98,10 @@ struct lpfc_nvmet_rcv_ctx {
#define LPFC_NVMET_STE_RSP 4
#define LPFC_NVMET_STE_DONE 5
uint16_t flag;
-#define LPFC_NVMET_IO_INP 1
-#define LPFC_NVMET_ABORT_OP 2
+#define LPFC_NVMET_IO_INP 0x1
+#define LPFC_NVMET_ABORT_OP 0x2
+#define LPFC_NVMET_CTX_RLS 0x4
+
struct rqb_dmabuf *rqb_buffer;
#ifdef CONFIG_SCSI_LPFC_DEBUG_FS
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 6903f03c88af4..8a1b948164191 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -477,7 +477,7 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
int error)
{
or->async_error = error;
- or->req_errors = req->errors ? : error;
+ or->req_errors = scsi_req(req)->result ? : error;
or->sense_len = scsi_req(req)->sense_len;
if (or->sense_len)
memcpy(or->sense, scsi_req(req)->sense, or->sense_len);
@@ -489,7 +489,10 @@ static void _set_error_resid(struct osd_request *or, struct request *req,
int osd_execute_request(struct osd_request *or)
{
- int error = blk_execute_rq(or->request->q, NULL, or->request, 0);
+ int error;
+
+ blk_execute_rq(or->request->q, NULL, or->request, 0);
+ error = scsi_req(or->request)->result ? -EIO : 0;
_set_error_resid(or, or->request, error);
return error;
@@ -1602,7 +1605,7 @@ static int _init_blk_request(struct osd_request *or,
req->rq_flags |= RQF_QUIET;
req->timeout = or->timeout;
- req->retries = or->retries;
+ scsi_req(req)->retries = or->retries;
if (has_out) {
or->out.req = req;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index c47f4b349bac4..67cbed92f07dd 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -327,7 +327,7 @@ static void osst_end_async(struct request *req, int update)
struct osst_tape *STp = SRpnt->stp;
struct rq_map_data *mdata = &SRpnt->stp->buffer->map_data;
- STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
+ STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result;
#if DEBUG
STp->write_pending = 0;
#endif
@@ -414,7 +414,7 @@ static int osst_execute(struct osst_request *SRpnt, const unsigned char *cmd,
memset(rq->cmd, 0, BLK_MAX_CDB); /* ATAPI hates garbage after CDB */
memcpy(rq->cmd, cmd, rq->cmd_len);
req->timeout = timeout;
- req->retries = retries;
+ rq->retries = retries;
req->end_io_data = SRpnt;
blk_execute_rq_nowait(req->q, NULL, req, 1, osst_end_async);
diff --git a/drivers/scsi/qla2xxx/qla_bsg.c b/drivers/scsi/qla2xxx/qla_bsg.c
index 84c9098cc089f..b6e40fd4c3c1a 100644
--- a/drivers/scsi/qla2xxx/qla_bsg.c
+++ b/drivers/scsi/qla2xxx/qla_bsg.c
@@ -2553,13 +2553,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
ql_log(ql_log_warn, vha, 0x7089,
"mbx abort_command "
"failed.\n");
- bsg_job->req->errors =
+ scsi_req(bsg_job->req)->result =
bsg_reply->result = -EIO;
} else {
ql_dbg(ql_dbg_user, vha, 0x708a,
"mbx abort_command "
"success.\n");
- bsg_job->req->errors =
+ scsi_req(bsg_job->req)->result =
bsg_reply->result = 0;
}
spin_lock_irqsave(&ha->hardware_lock, flags);
@@ -2570,7 +2570,7 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
}
spin_unlock_irqrestore(&ha->hardware_lock, flags);
ql_log(ql_log_info, vha, 0x708b, "SRB not found to abort.\n");
- bsg_job->req->errors = bsg_reply->result = -ENXIO;
+ scsi_req(bsg_job->req)->result = bsg_reply->result = -ENXIO;
return 0;
done:
diff --git a/drivers/scsi/scsi_debugfs.c b/drivers/scsi/scsi_debugfs.c
new file mode 100644
index 0000000000000..a97c9507103d2
--- /dev/null
+++ b/drivers/scsi/scsi_debugfs.c
@@ -0,0 +1,13 @@
+#include <linux/seq_file.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_dbg.h>
+#include "scsi_debugfs.h"
+
+/*
+ * scsi_show_rq - append the SCSI CDB of @rq to a request dump
+ * @m: seq_file being written
+ * @rq: request whose scsi_cmnd is decoded
+ *
+ * Emits ", .cmd=<formatted CDB>".  cmd->cmnd is assigned during command
+ * setup, so a request that is dumped before then may still have it NULL;
+ * print "(?)" in that case instead of handing a NULL CDB to
+ * __scsi_format_command().
+ */
+void scsi_show_rq(struct seq_file *m, struct request *rq)
+{
+	struct scsi_cmnd *cmd = container_of(scsi_req(rq), typeof(*cmd), req);
+	const unsigned char *cdb = cmd->cmnd;
+	char buf[80] = "(?)";
+
+	if (cdb)
+		__scsi_format_command(buf, sizeof(buf), cdb, cmd->cmd_len);
+	seq_printf(m, ", .cmd=%s", buf);
+}
diff --git a/drivers/scsi/scsi_debugfs.h b/drivers/scsi/scsi_debugfs.h
new file mode 100644
index 0000000000000..951b043e82d07
--- /dev/null
+++ b/drivers/scsi/scsi_debugfs.h
@@ -0,0 +1,4 @@
+struct request;
+struct seq_file;
+
+void scsi_show_rq(struct seq_file *m, struct request *rq);
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index f2cafae150bcd..2db412dd4b447 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1988,7 +1988,7 @@ static void scsi_eh_lock_door(struct scsi_device *sdev)
req->rq_flags |= RQF_QUIET;
req->timeout = 10 * HZ;
- req->retries = 5;
+ rq->retries = 5;
blk_execute_rq_nowait(req->q, NULL, req, 1, eh_lock_door_done);
}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 15c9fe766071a..1c3e87d6c48f1 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -34,6 +34,7 @@
#include <trace/events/scsi.h>
+#include "scsi_debugfs.h"
#include "scsi_priv.h"
#include "scsi_logging.h"
@@ -229,8 +230,8 @@ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason)
* @rq_flags: flags for ->rq_flags
* @resid: optional residual length
*
- * returns the req->errors value which is the scsi_cmnd result
- * field.
+ * Returns the scsi_cmnd result field if a command was executed, or a negative
+ * Linux error code if we didn't get that far.
*/
int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
int data_direction, void *buffer, unsigned bufflen,
@@ -256,7 +257,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
rq->cmd_len = COMMAND_SIZE(cmd[0]);
memcpy(rq->cmd, cmd, rq->cmd_len);
- req->retries = retries;
+ rq->retries = retries;
req->timeout = timeout;
req->cmd_flags |= flags;
req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT;
@@ -281,7 +282,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE);
if (sshdr)
scsi_normalize_sense(rq->sense, rq->sense_len, sshdr);
- ret = req->errors;
+ ret = rq->result;
out:
blk_put_request(req);
@@ -797,8 +798,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
/*
* __scsi_error_from_host_byte may have reset the host_byte
*/
- req->errors = cmd->result;
-
+ scsi_req(req)->result = cmd->result;
scsi_req(req)->resid_len = scsi_get_resid(cmd);
if (scsi_bidi_cmnd(cmd)) {
@@ -835,7 +835,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
/*
* Recovered errors need reporting, but they're always treated as
* success, so fiddle the result code here. For passthrough requests
- * we already took a copy of the original into rq->errors which
+ * we already took a copy of the original into sreq->result which
* is what gets returned to the user
*/
if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) {
@@ -1177,7 +1177,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
cmd->cmd_len = scsi_req(req)->cmd_len;
cmd->cmnd = scsi_req(req)->cmd;
cmd->transfersize = blk_rq_bytes(req);
- cmd->allowed = req->retries;
+ cmd->allowed = scsi_req(req)->retries;
return BLKPREP_OK;
}
@@ -1281,7 +1281,7 @@ scsi_prep_return(struct request_queue *q, struct request *req, int ret)
switch (ret) {
case BLKPREP_KILL:
case BLKPREP_INVALID:
- req->errors = DID_NO_CONNECT << 16;
+ scsi_req(req)->result = DID_NO_CONNECT << 16;
/* release the command and kill it */
if (req->special) {
struct scsi_cmnd *cmd = req->special;
@@ -1905,7 +1905,7 @@ static int scsi_mq_prep_fn(struct request *req)
static void scsi_mq_done(struct scsi_cmnd *cmd)
{
trace_scsi_dispatch_cmd_done(cmd);
- blk_mq_complete_request(cmd->request, cmd->request->errors);
+ blk_mq_complete_request(cmd->request);
}
static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -2154,10 +2154,13 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
return q;
}
-static struct blk_mq_ops scsi_mq_ops = {
+static const struct blk_mq_ops scsi_mq_ops = {
.queue_rq = scsi_queue_rq,
.complete = scsi_softirq_done,
.timeout = scsi_timeout,
+#ifdef CONFIG_BLK_DEBUG_FS
+ .show_rq = scsi_show_rq,
+#endif
.init_request = scsi_init_request,
.exit_request = scsi_exit_request,
.map_queues = scsi_map_queues,
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index cdbb293aca08f..9fdbd50c31b45 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -184,9 +184,9 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
blk_rq_bytes(req->next_rq);
handler = to_sas_internal(shost->transportt)->f->smp_handler;
ret = handler(shost, rphy, req);
- req->errors = ret;
+ scsi_req(req)->result = ret;
- blk_end_request_all(req, ret);
+ blk_end_request_all(req, 0);
spin_lock_irq(q->queue_lock);
}
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 35ad5e8a31ab3..0dc95e102e697 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -418,6 +418,46 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(provisioning_mode);
+/*
+ * sysfs names of the SD_ZERO_* zeroing modes, indexed by mode value.
+ * Both the strings and the table itself are read-only.
+ */
+static const char * const zeroing_mode[] = {
+	[SD_ZERO_WRITE]		= "write",
+	[SD_ZERO_WS]		= "writesame",
+	[SD_ZERO_WS16_UNMAP]	= "writesame_16_unmap",
+	[SD_ZERO_WS10_UNMAP]	= "writesame_10_unmap",
+};
+
+/*
+ * sysfs show: print the name of the disk's current zeroing mode followed
+ * by a newline.  The 20-byte limit is exactly enough for the longest name
+ * in zeroing_mode[] ("writesame_16_unmap", 18 characters) plus '\n' and the
+ * terminating NUL.
+ */
+static ssize_t
+zeroing_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+ return snprintf(buf, 20, "%s\n", zeroing_mode[sdkp->zeroing_mode]);
+}
+
+/*
+ * sysfs store: select the zeroing mode by name.
+ *
+ * Requires CAP_SYS_ADMIN (-EACCES otherwise).  Names are matched with
+ * sysfs_streq() so that input carrying the trailing newline written by
+ * e.g. "echo" is accepted; a strncmp() against the bare mode name sees
+ * '\n' versus '\0' and rejects it.  Returns @count on success and -EINVAL
+ * for an unknown name.
+ */
+static ssize_t
+zeroing_mode_store(struct device *dev, struct device_attribute *attr,
+		   const char *buf, size_t count)
+{
+	struct scsi_disk *sdkp = to_scsi_disk(dev);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (sysfs_streq(buf, zeroing_mode[SD_ZERO_WRITE]))
+		sdkp->zeroing_mode = SD_ZERO_WRITE;
+	else if (sysfs_streq(buf, zeroing_mode[SD_ZERO_WS]))
+		sdkp->zeroing_mode = SD_ZERO_WS;
+	else if (sysfs_streq(buf, zeroing_mode[SD_ZERO_WS16_UNMAP]))
+		sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
+	else if (sysfs_streq(buf, zeroing_mode[SD_ZERO_WS10_UNMAP]))
+		sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
+	else
+		return -EINVAL;
+
+	return count;
+}
+static DEVICE_ATTR_RW(zeroing_mode);
+
static ssize_t
max_medium_access_timeouts_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -496,6 +536,7 @@ static struct attribute *sd_disk_attrs[] = {
&dev_attr_app_tag_own.attr,
&dev_attr_thin_provisioning.attr,
&dev_attr_provisioning_mode.attr,
+ &dev_attr_zeroing_mode.attr,
&dev_attr_max_write_same_blocks.attr,
&dev_attr_max_medium_access_timeouts.attr,
NULL,
@@ -644,26 +685,11 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
unsigned int logical_block_size = sdkp->device->sector_size;
unsigned int max_blocks = 0;
- q->limits.discard_zeroes_data = 0;
-
- /*
- * When LBPRZ is reported, discard alignment and granularity
- * must be fixed to the logical block size. Otherwise the block
- * layer will drop misaligned portions of the request which can
- * lead to data corruption. If LBPRZ is not set, we honor the
- * device preference.
- */
- if (sdkp->lbprz) {
- q->limits.discard_alignment = 0;
- q->limits.discard_granularity = logical_block_size;
- } else {
- q->limits.discard_alignment = sdkp->unmap_alignment *
- logical_block_size;
- q->limits.discard_granularity =
- max(sdkp->physical_block_size,
- sdkp->unmap_granularity * logical_block_size);
- }
-
+ q->limits.discard_alignment =
+ sdkp->unmap_alignment * logical_block_size;
+ q->limits.discard_granularity =
+ max(sdkp->physical_block_size,
+ sdkp->unmap_granularity * logical_block_size);
sdkp->provisioning_mode = mode;
switch (mode) {
@@ -681,19 +707,16 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
case SD_LBP_WS16:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS16_BLOCKS);
- q->limits.discard_zeroes_data = sdkp->lbprz;
break;
case SD_LBP_WS10:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS10_BLOCKS);
- q->limits.discard_zeroes_data = sdkp->lbprz;
break;
case SD_LBP_ZERO:
max_blocks = min_not_zero(sdkp->max_ws_blocks,
(u32)SD_MAX_WS10_BLOCKS);
- q->limits.discard_zeroes_data = 1;
break;
}
@@ -701,93 +724,122 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
}
-/**
- * sd_setup_discard_cmnd - unmap blocks on thinly provisioned device
- * @sdp: scsi device to operate on
- * @rq: Request to prepare
- *
- * Will issue either UNMAP or WRITE SAME(16) depending on preference
- * indicated by target device.
- **/
-static int sd_setup_discard_cmnd(struct scsi_cmnd *cmd)
+static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
{
- struct request *rq = cmd->request;
struct scsi_device *sdp = cmd->device;
- struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
- sector_t sector = blk_rq_pos(rq);
- unsigned int nr_sectors = blk_rq_sectors(rq);
- unsigned int len;
- int ret;
+ struct request *rq = cmd->request;
+ u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+ unsigned int data_len = 24;
char *buf;
- struct page *page;
-
- sector >>= ilog2(sdp->sector_size) - 9;
- nr_sectors >>= ilog2(sdp->sector_size) - 9;
- page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
- if (!page)
+ rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!rq->special_vec.bv_page)
return BLKPREP_DEFER;
+ rq->special_vec.bv_offset = 0;
+ rq->special_vec.bv_len = data_len;
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
- switch (sdkp->provisioning_mode) {
- case SD_LBP_UNMAP:
- buf = page_address(page);
+ cmd->cmd_len = 10;
+ cmd->cmnd[0] = UNMAP;
+ cmd->cmnd[8] = 24;
- cmd->cmd_len = 10;
- cmd->cmnd[0] = UNMAP;
- cmd->cmnd[8] = 24;
+ buf = page_address(rq->special_vec.bv_page);
+ put_unaligned_be16(6 + 16, &buf[0]);
+ put_unaligned_be16(16, &buf[2]);
+ put_unaligned_be64(sector, &buf[8]);
+ put_unaligned_be32(nr_sectors, &buf[16]);
- put_unaligned_be16(6 + 16, &buf[0]);
- put_unaligned_be16(16, &buf[2]);
- put_unaligned_be64(sector, &buf[8]);
- put_unaligned_be32(nr_sectors, &buf[16]);
+ cmd->allowed = SD_MAX_RETRIES;
+ cmd->transfersize = data_len;
+ rq->timeout = SD_TIMEOUT;
+ scsi_req(rq)->resid_len = data_len;
- len = 24;
- break;
+ return scsi_init_io(cmd);
+}
- case SD_LBP_WS16:
- cmd->cmd_len = 16;
- cmd->cmnd[0] = WRITE_SAME_16;
+static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
+{
+ struct scsi_device *sdp = cmd->device;
+ struct request *rq = cmd->request;
+ u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 data_len = sdp->sector_size;
+
+ rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!rq->special_vec.bv_page)
+ return BLKPREP_DEFER;
+ rq->special_vec.bv_offset = 0;
+ rq->special_vec.bv_len = data_len;
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+ cmd->cmd_len = 16;
+ cmd->cmnd[0] = WRITE_SAME_16;
+ if (unmap)
cmd->cmnd[1] = 0x8; /* UNMAP */
- put_unaligned_be64(sector, &cmd->cmnd[2]);
- put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
+ put_unaligned_be64(sector, &cmd->cmnd[2]);
+ put_unaligned_be32(nr_sectors, &cmd->cmnd[10]);
- len = sdkp->device->sector_size;
- break;
+ cmd->allowed = SD_MAX_RETRIES;
+ cmd->transfersize = data_len;
+ rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
+ scsi_req(rq)->resid_len = data_len;
- case SD_LBP_WS10:
- case SD_LBP_ZERO:
- cmd->cmd_len = 10;
- cmd->cmnd[0] = WRITE_SAME;
- if (sdkp->provisioning_mode == SD_LBP_WS10)
- cmd->cmnd[1] = 0x8; /* UNMAP */
- put_unaligned_be32(sector, &cmd->cmnd[2]);
- put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
+ return scsi_init_io(cmd);
+}
- len = sdkp->device->sector_size;
- break;
+static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
+{
+ struct scsi_device *sdp = cmd->device;
+ struct request *rq = cmd->request;
+ u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 data_len = sdp->sector_size;
- default:
- ret = BLKPREP_INVALID;
- goto out;
- }
+ rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!rq->special_vec.bv_page)
+ return BLKPREP_DEFER;
+ rq->special_vec.bv_offset = 0;
+ rq->special_vec.bv_len = data_len;
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
- rq->timeout = SD_TIMEOUT;
+ cmd->cmd_len = 10;
+ cmd->cmnd[0] = WRITE_SAME;
+ if (unmap)
+ cmd->cmnd[1] = 0x8; /* UNMAP */
+ put_unaligned_be32(sector, &cmd->cmnd[2]);
+ put_unaligned_be16(nr_sectors, &cmd->cmnd[7]);
- cmd->transfersize = len;
cmd->allowed = SD_MAX_RETRIES;
+ cmd->transfersize = data_len;
+ rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT;
+ scsi_req(rq)->resid_len = data_len;
- rq->special_vec.bv_page = page;
- rq->special_vec.bv_offset = 0;
- rq->special_vec.bv_len = len;
+ return scsi_init_io(cmd);
+}
- rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
- scsi_req(rq)->resid_len = len;
+static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
+{
+ struct request *rq = cmd->request;
+ struct scsi_device *sdp = cmd->device;
+ struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
+ u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9);
+ u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9);
+
+ if (!(rq->cmd_flags & REQ_NOUNMAP)) {
+ switch (sdkp->zeroing_mode) {
+ case SD_ZERO_WS16_UNMAP:
+ return sd_setup_write_same16_cmnd(cmd, true);
+ case SD_ZERO_WS10_UNMAP:
+ return sd_setup_write_same10_cmnd(cmd, true);
+ }
+ }
- ret = scsi_init_io(cmd);
-out:
- if (ret != BLKPREP_OK)
- __free_page(page);
- return ret;
+ if (sdp->no_write_same)
+ return BLKPREP_INVALID;
+ if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
+ return sd_setup_write_same16_cmnd(cmd, false);
+ return sd_setup_write_same10_cmnd(cmd, false);
}
static void sd_config_write_same(struct scsi_disk *sdkp)
@@ -816,9 +868,20 @@ static void sd_config_write_same(struct scsi_disk *sdkp)
sdkp->max_ws_blocks = 0;
}
+ if (sdkp->lbprz && sdkp->lbpws)
+ sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP;
+ else if (sdkp->lbprz && sdkp->lbpws10)
+ sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP;
+ else if (sdkp->max_ws_blocks)
+ sdkp->zeroing_mode = SD_ZERO_WS;
+ else
+ sdkp->zeroing_mode = SD_ZERO_WRITE;
+
out:
blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks *
(logical_block_size >> 9));
+ blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks *
+ (logical_block_size >> 9));
}
/**
@@ -1155,7 +1218,20 @@ static int sd_init_command(struct scsi_cmnd *cmd)
switch (req_op(rq)) {
case REQ_OP_DISCARD:
- return sd_setup_discard_cmnd(cmd);
+ switch (scsi_disk(rq->rq_disk)->provisioning_mode) {
+ case SD_LBP_UNMAP:
+ return sd_setup_unmap_cmnd(cmd);
+ case SD_LBP_WS16:
+ return sd_setup_write_same16_cmnd(cmd, true);
+ case SD_LBP_WS10:
+ return sd_setup_write_same10_cmnd(cmd, true);
+ case SD_LBP_ZERO:
+ return sd_setup_write_same10_cmnd(cmd, false);
+ default:
+ return BLKPREP_INVALID;
+ }
+ case REQ_OP_WRITE_ZEROES:
+ return sd_setup_write_zeroes_cmnd(cmd);
case REQ_OP_WRITE_SAME:
return sd_setup_write_same_cmnd(cmd);
case REQ_OP_FLUSH:
@@ -1795,6 +1871,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
switch (req_op(req)) {
case REQ_OP_DISCARD:
+ case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_ZONE_RESET:
if (!result) {
@@ -2768,7 +2845,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
sd_config_discard(sdkp, SD_LBP_WS16);
} else { /* LBP VPD page tells us what to use */
- if (sdkp->lbpu && sdkp->max_unmap_blocks && !sdkp->lbprz)
+ if (sdkp->lbpu && sdkp->max_unmap_blocks)
sd_config_discard(sdkp, SD_LBP_UNMAP);
else if (sdkp->lbpws)
sd_config_discard(sdkp, SD_LBP_WS16);
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 4dac35e96a75b..a2c4b5c35379d 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -59,6 +59,13 @@ enum {
SD_LBP_DISABLE, /* Discard disabled due to failed cmd */
};
+enum {
+ SD_ZERO_WRITE = 0, /* Use WRITE(10/16) command */
+ SD_ZERO_WS, /* Use WRITE SAME(10/16) command */
+ SD_ZERO_WS16_UNMAP, /* Use WRITE SAME(16) with UNMAP */
+ SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */
+};
+
struct scsi_disk {
struct scsi_driver *driver; /* always &sd_template */
struct scsi_device *device;
@@ -89,6 +96,7 @@ struct scsi_disk {
u8 write_prot;
u8 protection_type;/* Data Integrity Field */
u8 provisioning_mode;
+ u8 zeroing_mode;
unsigned ATO : 1; /* state of disk ATO bit */
unsigned cache_override : 1; /* temp override of WCE,RCD */
unsigned WCE : 1; /* state of disk WCE bit */
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 92620c8ea8ad9..1994f7799fced 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -329,6 +329,7 @@ void sd_zbc_complete(struct scsi_cmnd *cmd,
switch (req_op(rq)) {
case REQ_OP_WRITE:
+ case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_ZONE_RESET:
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 225abaad4d1cc..504504beaa5e0 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1300,7 +1300,7 @@ sg_rq_end_io(struct request *rq, int uptodate)
pr_info("%s: device detaching\n", __func__);
sense = req->sense;
- result = rq->errors;
+ result = req->result;
resid = req->resid_len;
SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp,
@@ -1718,7 +1718,7 @@ sg_start_req(Sg_request *srp, unsigned char *cmd)
srp->rq = rq;
rq->end_io_data = srp;
- rq->retries = SG_DEFAULT_RETRIES;
+ req->retries = SG_DEFAULT_RETRIES;
if ((dxfer_len <= 0) || (dxfer_dir == SG_DXFER_NONE))
return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index e5ef78a6848ef..1ea34d6f54370 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -480,7 +480,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
atomic64_add(ktime_to_ns(now), &STp->stats->tot_write_time);
atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
atomic64_inc(&STp->stats->write_cnt);
- if (req->errors) {
+ if (scsi_req(req)->result) {
atomic64_add(atomic_read(&STp->stats->last_write_size)
- STp->buffer->cmdstat.residual,
&STp->stats->write_byte_cnt);
@@ -494,7 +494,7 @@ static void st_do_stats(struct scsi_tape *STp, struct request *req)
atomic64_add(ktime_to_ns(now), &STp->stats->tot_read_time);
atomic64_add(ktime_to_ns(now), &STp->stats->tot_io_time);
atomic64_inc(&STp->stats->read_cnt);
- if (req->errors) {
+ if (scsi_req(req)->result) {
atomic64_add(atomic_read(&STp->stats->last_read_size)
- STp->buffer->cmdstat.residual,
&STp->stats->read_byte_cnt);
@@ -518,7 +518,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate)
struct scsi_tape *STp = SRpnt->stp;
struct bio *tmp;
- STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
+ STp->buffer->cmdstat.midlevel_result = SRpnt->result = rq->result;
STp->buffer->cmdstat.residual = rq->resid_len;
st_do_stats(STp, req);
@@ -579,7 +579,7 @@ static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
memset(rq->cmd, 0, BLK_MAX_CDB);
memcpy(rq->cmd, cmd, rq->cmd_len);
req->timeout = timeout;
- req->retries = retries;
+ rq->retries = retries;
req->end_io_data = SRpnt;
blk_execute_rq_nowait(req->q, NULL, req, 1, st_scsi_execute_end);
diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h
index 8886458748c13..a676bccabd436 100644
--- a/drivers/staging/lustre/lustre/include/lustre_disk.h
+++ b/drivers/staging/lustre/lustre/include/lustre_disk.h
@@ -133,13 +133,9 @@ struct lustre_sb_info {
struct obd_export *lsi_osd_exp;
char lsi_osd_type[16];
char lsi_fstype[16];
- struct backing_dev_info lsi_bdi; /* each client mountpoint needs
- * own backing_dev_info
- */
};
#define LSI_UMOUNT_FAILOVER 0x00200000
-#define LSI_BDI_INITIALIZED 0x00400000
#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info))
#define s2lsi_nocast(sb) ((sb)->s_fs_info)
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index b229cbc7bb334..d483c44aafe5d 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -863,15 +863,6 @@ void ll_lli_init(struct ll_inode_info *lli)
mutex_init(&lli->lli_layout_mutex);
}
-static inline int ll_bdi_register(struct backing_dev_info *bdi)
-{
- static atomic_t ll_bdi_num = ATOMIC_INIT(0);
-
- bdi->name = "lustre";
- return bdi_register(bdi, NULL, "lustre-%d",
- atomic_inc_return(&ll_bdi_num));
-}
-
int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
{
struct lustre_profile *lprof = NULL;
@@ -881,6 +872,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
char *profilenm = get_profile_name(sb);
struct config_llog_instance *cfg;
int err;
+ static atomic_t ll_bdi_num = ATOMIC_INIT(0);
CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
@@ -903,16 +895,11 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
if (err)
goto out_free;
- err = bdi_init(&lsi->lsi_bdi);
- if (err)
- goto out_free;
- lsi->lsi_flags |= LSI_BDI_INITIALIZED;
- lsi->lsi_bdi.capabilities = 0;
- err = ll_bdi_register(&lsi->lsi_bdi);
+ err = super_setup_bdi_name(sb, "lustre-%d",
+ atomic_inc_return(&ll_bdi_num));
if (err)
goto out_free;
- sb->s_bdi = &lsi->lsi_bdi;
/* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
sb->s_d_op = &ll_d_ops;
@@ -1033,11 +1020,6 @@ void ll_put_super(struct super_block *sb)
if (profilenm)
class_del_profile(profilenm);
- if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
- bdi_destroy(&lsi->lsi_bdi);
- lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
- }
-
ll_free_sbi(sb);
lsi->lsi_llsbi = NULL;
diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c
index 344e8448869c1..5798810197ecf 100644
--- a/drivers/target/iscsi/iscsi_target_configfs.c
+++ b/drivers/target/iscsi/iscsi_target_configfs.c
@@ -167,10 +167,7 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
struct iscsi_portal_group *tpg;
struct iscsi_tpg_np *tpg_np;
char *str, *str2, *ip_str, *port_str;
- struct sockaddr_storage sockaddr;
- struct sockaddr_in *sock_in;
- struct sockaddr_in6 *sock_in6;
- unsigned long port;
+ struct sockaddr_storage sockaddr = { };
int ret;
char buf[MAX_PORTAL_LEN + 1];
@@ -182,21 +179,19 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
memset(buf, 0, MAX_PORTAL_LEN + 1);
snprintf(buf, MAX_PORTAL_LEN + 1, "%s", name);
- memset(&sockaddr, 0, sizeof(struct sockaddr_storage));
-
str = strstr(buf, "[");
if (str) {
- const char *end;
-
str2 = strstr(str, "]");
if (!str2) {
pr_err("Unable to locate trailing \"]\""
" in IPv6 iSCSI network portal address\n");
return ERR_PTR(-EINVAL);
}
- str++; /* Skip over leading "[" */
+
+ ip_str = str + 1; /* Skip over leading "[" */
*str2 = '\0'; /* Terminate the unbracketed IPv6 address */
str2++; /* Skip over the \0 */
+
port_str = strstr(str2, ":");
if (!port_str) {
pr_err("Unable to locate \":port\""
@@ -205,23 +200,8 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
}
*port_str = '\0'; /* Terminate string for IP */
port_str++; /* Skip over ":" */
-
- ret = kstrtoul(port_str, 0, &port);
- if (ret < 0) {
- pr_err("kstrtoul() failed for port_str: %d\n", ret);
- return ERR_PTR(ret);
- }
- sock_in6 = (struct sockaddr_in6 *)&sockaddr;
- sock_in6->sin6_family = AF_INET6;
- sock_in6->sin6_port = htons((unsigned short)port);
- ret = in6_pton(str, -1,
- (void *)&sock_in6->sin6_addr.in6_u, -1, &end);
- if (ret <= 0) {
- pr_err("in6_pton returned: %d\n", ret);
- return ERR_PTR(-EINVAL);
- }
} else {
- str = ip_str = &buf[0];
+ ip_str = &buf[0];
port_str = strstr(ip_str, ":");
if (!port_str) {
pr_err("Unable to locate \":port\""
@@ -230,17 +210,15 @@ static struct se_tpg_np *lio_target_call_addnptotpg(
}
*port_str = '\0'; /* Terminate string for IP */
port_str++; /* Skip over ":" */
+ }
- ret = kstrtoul(port_str, 0, &port);
- if (ret < 0) {
- pr_err("kstrtoul() failed for port_str: %d\n", ret);
- return ERR_PTR(ret);
- }
- sock_in = (struct sockaddr_in *)&sockaddr;
- sock_in->sin_family = AF_INET;
- sock_in->sin_port = htons((unsigned short)port);
- sock_in->sin_addr.s_addr = in_aton(ip_str);
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC, ip_str,
+ port_str, &sockaddr);
+ if (ret) {
+ pr_err("malformed ip/port passed: %s\n", name);
+ return ERR_PTR(ret);
}
+
tpg = container_of(se_tpg, struct iscsi_portal_group, tpg_se_tpg);
ret = iscsit_get_tpg(tpg);
if (ret < 0)
diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c
index c754ae33bf7b1..d2f089cfa9aed 100644
--- a/drivers/target/target_core_device.c
+++ b/drivers/target/target_core_device.c
@@ -851,7 +851,7 @@ bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib,
attrib->unmap_granularity = q->limits.discard_granularity / block_size;
attrib->unmap_granularity_alignment = q->limits.discard_alignment /
block_size;
- attrib->unmap_zeroes_data = q->limits.discard_zeroes_data;
+ attrib->unmap_zeroes_data = 0;
return true;
}
EXPORT_SYMBOL(target_configure_unmap_from_queue);
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 94cda7991e80a..a93d94e68ab5f 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1008,7 +1008,7 @@ pscsi_execute_cmd(struct se_cmd *cmd)
req->timeout = PS_TIMEOUT_DISK;
else
req->timeout = PS_TIMEOUT_OTHER;
- req->retries = PS_RETRY;
+ scsi_req(req)->retries = PS_RETRY;
blk_execute_rq_nowait(pdv->pdv_sd->request_queue, NULL, req,
(cmd->sam_task_attr == TCM_HEAD_TAG),
@@ -1050,7 +1050,7 @@ static void pscsi_req_done(struct request *req, int uptodate)
struct se_cmd *cmd = req->end_io_data;
struct pscsi_plugin_task *pt = cmd->priv;
- pt->pscsi_result = req->errors;
+ pt->pscsi_result = scsi_req(req)->result;
pt->pscsi_resid = scsi_req(req)->resid_len;
cmd->scsi_status = status_byte(pt->pscsi_result) << 1;
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index a89f3cfe3c7d7..c202930086edb 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
goto err_names;
init_rwsem(&v9ses->rename_sem);
- rc = bdi_setup_and_register(&v9ses->bdi, "9p");
- if (rc)
- goto err_names;
-
v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
if (IS_ERR(v9ses->clnt)) {
rc = PTR_ERR(v9ses->clnt);
p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
- goto err_bdi;
+ goto err_names;
}
v9ses->flags = V9FS_ACCESS_USER;
@@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
err_clnt:
p9_client_destroy(v9ses->clnt);
-err_bdi:
- bdi_destroy(&v9ses->bdi);
err_names:
kfree(v9ses->uname);
kfree(v9ses->aname);
@@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
kfree(v9ses->uname);
kfree(v9ses->aname);
- bdi_destroy(&v9ses->bdi);
-
spin_lock(&v9fs_sessionlist_lock);
list_del(&v9ses->slist);
spin_unlock(&v9fs_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 443d12e020436..76eaf49abd3ae 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -114,7 +114,6 @@ struct v9fs_session_info {
kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
- struct backing_dev_info bdi;
struct rw_semaphore rename_sem;
};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index de3ed86291969..a0965fb587a5f 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data)
*
*/
-static void
+static int
v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
int flags, void *data)
{
+ int ret;
+
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
sb->s_blocksize = 1 << sb->s_blocksize_bits;
@@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
sb->s_xattr = v9fs_xattr_handlers;
} else
sb->s_op = &v9fs_super_ops;
- sb->s_bdi = &v9ses->bdi;
+
+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
@@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
#endif
save_mount_options(sb, data);
+ return 0;
}
/**
@@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
retval = PTR_ERR(sb);
goto clunk_fid;
}
- v9fs_fill_super(sb, v9ses, flags, data);
+ retval = v9fs_fill_super(sb, v9ses, flags, data);
+ if (retval)
+ goto release_sb;
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
sb->s_d_op = &v9fs_cached_dentry_operations;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a6901360fb81d..393672997cc23 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -318,7 +318,6 @@ struct afs_volume {
unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */
struct afs_server *servers[8]; /* servers on which volume resides (ordered) */
struct rw_semaphore server_sem; /* lock for accessing current server */
- struct backing_dev_info bdi;
};
/*
diff --git a/fs/afs/super.c b/fs/afs/super.c
index fbdb022b75a27..c79633e5cfd80 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -319,7 +319,10 @@ static int afs_fill_super(struct super_block *sb,
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op = &afs_super_ops;
- sb->s_bdi = &as->volume->bdi;
+ ret = super_setup_bdi(sb);
+ if (ret)
+ return ret;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
/* allocate the root inode and dentry */
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 546f9d01710b5..db73d6dad02b5 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,11 +106,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
volume->cell = params->cell;
volume->vid = vlocation->vldb.vid[params->type];
- volume->bdi.ra_pages = VM_MAX_READAHEAD*1024/PAGE_SIZE;
- ret = bdi_setup_and_register(&volume->bdi, "afs");
- if (ret)
- goto error_bdi;
-
init_rwsem(&volume->server_sem);
/* look up all the applicable server records */
@@ -156,8 +151,6 @@ error:
return ERR_PTR(ret);
error_discard:
- bdi_destroy(&volume->bdi);
-error_bdi:
up_write(&params->cell->vl_sem);
for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -207,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume)
for (loop = volume->nservers - 1; loop >= 0; loop--)
afs_put_server(volume->servers[loop]);
- bdi_destroy(&volume->bdi);
kfree(volume);
_leave(" [destroyed]");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2eca00ec43706..9ccabe3bb7de1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -885,6 +885,8 @@ static void bdev_evict_inode(struct inode *inode)
spin_lock(&bdev_lock);
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
+ /* Detach inode from wb early as bdi_put() may free bdi->wb */
+ inode_detach_wb(inode);
if (bdev->bd_bdi != &noop_backing_dev_info) {
bdi_put(bdev->bd_bdi);
bdev->bd_bdi = &noop_backing_dev_info;
@@ -1451,7 +1453,6 @@ int revalidate_disk(struct gendisk *disk)
if (disk->fops->revalidate_disk)
ret = disk->fops->revalidate_disk(disk);
- blk_integrity_revalidate(disk);
bdev = bdget_disk(disk, 0);
if (!bdev)
return ret;
@@ -1556,8 +1557,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- if (bdev->bd_bdi == &noop_backing_dev_info)
- bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
if (!partno) {
ret = -ENXIO;
@@ -1622,6 +1621,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
}
+
+ if (bdev->bd_bdi == &noop_backing_dev_info)
+ bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
} else {
if (bdev->bd_contains == bdev) {
ret = 0;
@@ -1653,8 +1655,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
- bdi_put(bdev->bd_bdi);
- bdev->bd_bdi = &noop_backing_dev_info;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
@@ -1876,12 +1876,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
kill_bdev(bdev);
bdev_write_inode(bdev);
- /*
- * Detaching bdev inode from its wb in __destroy_inode()
- * is too late: the queue which embeds its bdi (along with
- * root wb) can be gone as soon as we put_disk() below.
- */
- inode_detach_wb(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
@@ -2074,7 +2068,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
loff_t len)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
- struct request_queue *q = bdev_get_queue(bdev);
struct address_space *mapping;
loff_t end = start + len - 1;
loff_t isize;
@@ -2110,18 +2103,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
- GFP_KERNEL, false);
+ GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
- /* Only punch if the device can do zeroing discard. */
- if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data)
- return -EOPNOTSUPP;
- error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
- GFP_KERNEL, 0);
+ error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
+ GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
- if (!blk_queue_discard(q))
- return -EOPNOTSUPP;
error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
GFP_KERNEL, 0);
break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c4115901d9064..3e21211e99c39 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -810,7 +810,6 @@ struct btrfs_fs_info {
struct btrfs_super_block *super_for_commit;
struct super_block *sb;
struct inode *btree_inode;
- struct backing_dev_info bdi;
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index eb1ee7b6f532b..061c1d1f774f2 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1808,21 +1808,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
return ret;
}
-static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
-{
- int err;
-
- err = bdi_setup_and_register(bdi, "btrfs");
- if (err)
- return err;
-
- bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
- bdi->congested_fn = btrfs_congested_fn;
- bdi->congested_data = info;
- bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
- return 0;
-}
-
/*
* called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens
@@ -2601,16 +2586,10 @@ int open_ctree(struct super_block *sb,
goto fail;
}
- ret = setup_bdi(fs_info, &fs_info->bdi);
- if (ret) {
- err = ret;
- goto fail_srcu;
- }
-
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret) {
err = ret;
- goto fail_bdi;
+ goto fail_srcu;
}
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
@@ -2718,7 +2697,6 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
- sb->s_bdi = &fs_info->bdi;
btrfs_init_btree_inode(fs_info);
@@ -2915,9 +2893,12 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
- fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
- fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
- SZ_4M / PAGE_SIZE);
+ sb->s_bdi->congested_fn = btrfs_congested_fn;
+ sb->s_bdi->congested_data = fs_info;
+ sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
+ sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
@@ -3285,8 +3266,6 @@ fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
-fail_bdi:
- bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
@@ -4007,7 +3986,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
percpu_counter_destroy(&fs_info->bio_counter);
- bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9530a333d302c..72a053c9a7f09 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1136,6 +1136,13 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_flags |= MS_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
+
+ err = super_setup_bdi(sb);
+ if (err) {
+ btrfs_err(fs_info, "super_setup_bdi failed");
+ return err;
+ }
+
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1a3e1b40799a0..9ecb2fd348cb3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -578,7 +578,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
if (writeback_stat >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
- set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -700,7 +700,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(&fsc->writeback_count) <
CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
- clear_bdi_congested(&fsc->backing_dev_info,
+ clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
if (rc < 0)
@@ -979,7 +979,7 @@ get_more_pages:
if (atomic_long_inc_return(&fsc->writeback_count) >
CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) {
- set_bdi_congested(&fsc->backing_dev_info,
+ set_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2ae393e2c31a..3ef11bc8d728d 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -251,7 +251,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
snprintf(name, sizeof(name), "../../bdi/%s",
- dev_name(fsc->backing_dev_info.dev));
+ dev_name(fsc->sb->s_bdi->dev));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
fsc->client->debugfs_dir,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 0ec8d0114e57b..a8c81b2052ca9 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -579,10 +579,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
atomic_long_set(&fsc->writeback_count, 0);
- err = bdi_init(&fsc->backing_dev_info);
- if (err < 0)
- goto fail_client;
-
err = -ENOMEM;
/*
* The number of concurrent works can be high but they don't need
@@ -590,7 +586,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
*/
fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
if (fsc->wb_wq == NULL)
- goto fail_bdi;
+ goto fail_client;
fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
@@ -624,8 +620,6 @@ fail_pg_inv_wq:
destroy_workqueue(fsc->pg_inv_wq);
fail_wb_wq:
destroy_workqueue(fsc->wb_wq);
-fail_bdi:
- bdi_destroy(&fsc->backing_dev_info);
fail_client:
ceph_destroy_client(fsc->client);
fail:
@@ -643,8 +637,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
- bdi_destroy(&fsc->backing_dev_info);
-
mempool_destroy(fsc->wb_pagevec_pool);
destroy_mount_options(fsc->mount_options);
@@ -937,33 +929,32 @@ static int ceph_compare_super(struct super_block *sb, void *data)
*/
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-static int ceph_register_bdi(struct super_block *sb,
- struct ceph_fs_client *fsc)
+static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
{
int err;
+ err = super_setup_bdi_name(sb, "ceph-%ld",
+ atomic_long_inc_return(&bdi_seq));
+ if (err)
+ return err;
+
/* set ra_pages based on rasize mount option? */
if (fsc->mount_options->rasize >= PAGE_SIZE)
- fsc->backing_dev_info.ra_pages =
+ sb->s_bdi->ra_pages =
(fsc->mount_options->rasize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else
- fsc->backing_dev_info.ra_pages =
- VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+ sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
if (fsc->mount_options->rsize > fsc->mount_options->rasize &&
fsc->mount_options->rsize >= PAGE_SIZE)
- fsc->backing_dev_info.io_pages =
+ sb->s_bdi->io_pages =
(fsc->mount_options->rsize + PAGE_SIZE - 1)
>> PAGE_SHIFT;
else if (fsc->mount_options->rsize == 0)
- fsc->backing_dev_info.io_pages = ULONG_MAX;
+ sb->s_bdi->io_pages = ULONG_MAX;
- err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
- atomic_long_inc_return(&bdi_seq));
- if (!err)
- sb->s_bdi = &fsc->backing_dev_info;
- return err;
+ return 0;
}
static struct dentry *ceph_mount(struct file_system_type *fs_type,
@@ -1018,7 +1009,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
dout("get_sb got existing client %p\n", fsc);
} else {
dout("get_sb using new client %p\n", fsc);
- err = ceph_register_bdi(sb, fsc);
+ err = ceph_setup_bdi(sb, fsc);
if (err < 0) {
res = ERR_PTR(err);
goto out_splat;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe6b9cfc4013e..176186b124575 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -92,8 +92,6 @@ struct ceph_fs_client {
struct workqueue_struct *trunc_wq;
atomic_long_t writeback_count;
- struct backing_dev_info backing_dev_info;
-
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_dentry_lru, *debugfs_caps;
struct dentry *debugfs_congestion_kb;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 07ed81cf1552e..cbd216b572390 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -68,7 +68,6 @@ struct cifs_sb_info {
umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
char *mountdata; /* options received at mount time or via DFS refs */
- struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
struct rcu_head rcu;
char *prepath;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index dd3f5fabfdf6a..34fee9fb7e4fe 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -138,7 +138,12 @@ cifs_read_super(struct super_block *sb)
sb->s_magic = CIFS_MAGIC_NUMBER;
sb->s_op = &cifs_super_ops;
sb->s_xattr = cifs_xattr_handlers;
- sb->s_bdi = &cifs_sb->bdi;
+ rc = super_setup_bdi(sb);
+ if (rc)
+ goto out_no_root;
+ /* tune readahead according to rsize */
+ sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE;
+
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */
inode = cifs_root_iget(sb);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d82467cfb0e2d..b3c9d8c310f2e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3692,10 +3692,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
int referral_walks_count = 0;
#endif
- rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
- if (rc)
- return rc;
-
#ifdef CONFIG_CIFS_DFS_UPCALL
try_mount_again:
/* cleanup activities if we're chasing a referral */
@@ -3723,7 +3719,6 @@ try_mount_again:
server = cifs_get_tcp_session(volume_info);
if (IS_ERR(server)) {
rc = PTR_ERR(server);
- bdi_destroy(&cifs_sb->bdi);
goto out;
}
if ((volume_info->max_credits < 20) ||
@@ -3780,9 +3775,6 @@ try_mount_again:
cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
- /* tune readahead according to rsize */
- cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
-
remote_path_check:
#ifdef CONFIG_CIFS_DFS_UPCALL
/*
@@ -3899,7 +3891,6 @@ mount_fail_check:
cifs_put_smb_ses(ses);
else
cifs_put_tcp_session(server, 0);
- bdi_destroy(&cifs_sb->bdi);
}
out:
@@ -4102,7 +4093,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
- bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb->mountdata);
kfree(cifs_sb->prepath);
call_rcu(&cifs_sb->rcu, delayed_free);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2dea594da1996..6058df380cc00 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
goto unlock_out;
}
- error = bdi_setup_and_register(&vc->bdi, "coda");
- if (error)
- goto unlock_out;
-
vc->vc_sb = sb;
mutex_unlock(&vc->vc_mutex);
@@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = &coda_super_operations;
sb->s_d_op = &coda_dentry_operations;
- sb->s_bdi = &vc->bdi;
+
+ error = super_setup_bdi(sb);
+ if (error)
+ goto error;
/* get root fid from Venus: this needs the root inode */
error = venus_rootfid(sb, &fid);
@@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
error:
mutex_lock(&vc->vc_mutex);
- bdi_destroy(&vc->bdi);
vc->vc_sb = NULL;
sb->s_fs_info = NULL;
unlock_out:
@@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb)
{
struct venus_comm *vcp = coda_vcp(sb);
mutex_lock(&vcp->vc_mutex);
- bdi_destroy(&vcp->bdi);
vcp->vc_sb = NULL;
sb->s_fs_info = NULL;
mutex_unlock(&vcp->vc_mutex);
diff --git a/fs/dax.c b/fs/dax.c
index 85abd741253d4..6433650be8338 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -991,7 +991,7 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
sector_t start_sector = dax.sector + (offset >> 9);
return blkdev_issue_zeroout(bdev, start_sector,
- length >> 9, GFP_NOFS, true);
+ length >> 9, GFP_NOFS, 0);
} else {
if (dax_map_atomic(bdev, &dax) < 0)
return PTR_ERR(dax.addr);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 95c1c8d345392..9c351bf757b20 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat {
struct ecryptfs_sb_info {
struct super_block *wsi_sb;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
- struct backing_dev_info bdi;
};
/* file private data. */
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 151872dcc1f40..9014479d01600 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out;
}
- rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
+ rc = super_setup_bdi(s);
if (rc)
goto out1;
ecryptfs_set_superblock_private(s, sbi);
- s->s_bdi = &sbi->bdi;
/* ->kill_sb() will take care of sbi after that point */
sbi = NULL;
@@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block *sb)
if (!sb_info)
return;
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
- bdi_destroy(&sb_info->bdi);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
}
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2e86086bc9403..5dc392404559b 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -64,7 +64,6 @@ struct exofs_dev {
* our extension to the in-memory superblock
*/
struct exofs_sb_info {
- struct backing_dev_info bdi; /* register our bdi with VFS */
struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
int s_timeout; /* timeout for OSD operations */
uint64_t s_nextid; /* highest object ID used */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 1076a4233b396..819624cfc8da4 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb)
sbi->one_comp.obj.partition);
exofs_sysfs_sb_del(sbi);
- bdi_destroy(&sbi->bdi);
exofs_free_sbi(sbi);
sb->s_fs_info = NULL;
}
@@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
__sbi_read_stats(sbi);
/* set up operation vectors */
- sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
- sb->s_bdi = &sbi->bdi;
+ ret = super_setup_bdi(sb);
+ if (ret) {
+ EXOFS_DBGMSG("Failed to super_setup_bdi\n");
+ goto free_sbi;
+ }
+ sb->s_bdi->ra_pages = __ra_pages(&sbi->layout);
sb->s_fs_info = sbi;
sb->s_op = &exofs_sops;
sb->s_export_op = &exofs_export_ops;
@@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- ret = bdi_setup_and_register(&sbi->bdi, "exofs");
- if (ret) {
- EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
- dput(sb->s_root);
- sb->s_root = NULL;
- goto free_sbi;
- }
-
exofs_sysfs_dbg_print();
_exofs_print_device("Mounting", opts->dev_name,
ore_comp_dev(&sbi->oc, 0),
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index b681b43c766e1..c2d7f3a92679d 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -382,9 +382,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
wake_up(&fc->blocked_waitq);
if (fc->num_background == fc->congestion_threshold &&
- fc->connected && fc->bdi_initialized) {
- clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
- clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ fc->connected && fc->sb) {
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
@@ -573,10 +573,9 @@ void fuse_request_send_background_locked(struct fuse_conn *fc,
fc->num_background++;
if (fc->num_background == fc->max_background)
fc->blocked = 1;
- if (fc->num_background == fc->congestion_threshold &&
- fc->bdi_initialized) {
- set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
- set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ if (fc->num_background == fc->congestion_threshold && fc->sb) {
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+ set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(&req->list, &fc->bg_queue);
flush_bg_queue(fc);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 32ac2c9b09c03..f33341d9501a0 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -527,9 +527,6 @@ struct fuse_conn {
/** Filesystem supports NFS exporting. Only set in INIT */
unsigned export_support:1;
- /** Set if bdi is valid */
- unsigned bdi_initialized:1;
-
/** write-back cache policy (default is write-through) */
unsigned writeback_cache:1;
@@ -631,9 +628,6 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
- /** Backing dev info */
- struct backing_dev_info bdi;
-
/** Entry on the fuse_conn_list */
struct list_head entry;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6fe6a88ecb4af..73cf051352521 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -386,12 +386,6 @@ static void fuse_send_destroy(struct fuse_conn *fc)
}
}
-static void fuse_bdi_destroy(struct fuse_conn *fc)
-{
- if (fc->bdi_initialized)
- bdi_destroy(&fc->bdi);
-}
-
static void fuse_put_super(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
@@ -403,7 +397,6 @@ static void fuse_put_super(struct super_block *sb)
list_del(&fc->entry);
fuse_ctl_remove_conn(fc);
mutex_unlock(&fuse_mutex);
- fuse_bdi_destroy(fc);
fuse_conn_put(fc);
}
@@ -928,7 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->no_flock = 1;
}
- fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
+ fc->sb->s_bdi->ra_pages =
+ min(fc->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -944,7 +938,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
- arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
+ arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -976,27 +970,18 @@ static void fuse_free_conn(struct fuse_conn *fc)
static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
{
int err;
+ char *suffix = "";
- fc->bdi.name = "fuse";
- fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
- /* fuse does it's own writeback accounting */
- fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
-
- err = bdi_init(&fc->bdi);
+ if (sb->s_bdev)
+ suffix = "-fuseblk";
+ err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev),
+ MINOR(fc->dev), suffix);
if (err)
return err;
- fc->bdi_initialized = 1;
-
- if (sb->s_bdev) {
- err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
- MAJOR(fc->dev), MINOR(fc->dev));
- } else {
- err = bdi_register_dev(&fc->bdi, fc->dev);
- }
-
- if (err)
- return err;
+ sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+ /* fuse does its own writeback accounting */
+ sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
/*
* For a single fuse filesystem use max 1% of dirty +
@@ -1010,7 +995,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
*
* /sys/class/bdi/<bdi>/max_ratio
*/
- bdi_set_max_ratio(&fc->bdi, 1);
+ bdi_set_max_ratio(sb->s_bdi, 1);
return 0;
}
@@ -1113,8 +1098,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto err_dev_free;
- sb->s_bdi = &fc->bdi;
-
/* Handle umasking inside the fuse code */
if (sb->s_flags & MS_POSIXACL)
fc->dont_mask = 1;
@@ -1182,7 +1165,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
err_dev_free:
fuse_dev_free(fud);
err_put_conn:
- fuse_bdi_destroy(fc);
fuse_conn_put(fc);
err_fput:
fput(file);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108e7ba81af7..ed67548b286cc 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -23,6 +23,7 @@
#include <linux/quotaops.h>
#include <linux/lockdep.h>
#include <linux/module.h>
+#include <linux/backing-dev.h>
#include "gfs2.h"
#include "incore.h"
@@ -1222,12 +1223,7 @@ static int set_gfs2_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
-
- /*
- * We set the bdi here to the queue backing, file systems can
- * overwrite this in ->fill_super()
- */
- s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
return 0;
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d5606099712a4..6d0f14c860997 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -554,12 +554,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
sb->s_magic = NCP_SUPER_MAGIC;
sb->s_op = &ncp_sops;
sb->s_d_op = &ncp_dentry_operations;
- sb->s_bdi = &server->bdi;
server = NCP_SBP(sb);
memset(server, 0, sizeof(*server));
- error = bdi_setup_and_register(&server->bdi, "ncpfs");
+ error = super_setup_bdi(sb);
if (error)
goto out_fput;
@@ -568,7 +567,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (data.info_fd != -1) {
struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
if (!info_sock)
- goto out_bdi;
+ goto out_fput;
server->info_sock = info_sock;
error = -EBADFD;
if (info_sock->type != SOCK_STREAM)
@@ -746,8 +745,6 @@ out_nls:
out_fput2:
if (server->info_sock)
sockfd_put(server->info_sock);
-out_bdi:
- bdi_destroy(&server->bdi);
out_fput:
sockfd_put(sock);
out:
@@ -788,7 +785,6 @@ static void ncp_put_super(struct super_block *sb)
kill_pid(server->m.wdog_pid, SIGTERM, 1);
put_pid(server->m.wdog_pid);
- bdi_destroy(&server->bdi);
kfree(server->priv.data);
kfree(server->auth.object_name);
vfree(server->rxbuf);
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index 55e26fd808869..366fd63cc506f 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -143,7 +143,6 @@ struct ncp_server {
size_t len;
__u8 data[128];
} unexpected_packet;
- struct backing_dev_info bdi;
};
extern void ncp_tcp_rcv_proc(struct work_struct *work);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 390ada8741bcb..04d15a0045e37 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -761,9 +761,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
- server->backing_dev_info.name = "nfs";
- server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
if (server->wsize > max_rpc_payload)
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
@@ -917,12 +914,6 @@ struct nfs_server *nfs_alloc_server(void)
return NULL;
}
- if (bdi_init(&server->backing_dev_info)) {
- nfs_free_iostats(server->io_stats);
- kfree(server);
- return NULL;
- }
-
ida_init(&server->openowner_id);
ida_init(&server->lockowner_id);
pnfs_init_server(server);
@@ -953,7 +944,6 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(&server->lockowner_id);
ida_destroy(&server->openowner_id);
nfs_free_iostats(server->io_stats);
- bdi_destroy(&server->backing_dev_info);
kfree(server);
nfs_release_automount_timer();
dprintk("<-- nfs_free_server()\n");
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7b38fedb7e032..9dc65d7ae7541 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -139,7 +139,7 @@ struct nfs_mount_request {
};
struct nfs_mount_info {
- void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+ int (*fill_super)(struct super_block *, struct nfs_mount_info *);
int (*set_security)(struct super_block *, struct dentry *, struct nfs_mount_info *);
struct nfs_parsed_mount_data *parsed;
struct nfs_clone_mount *cloned;
@@ -407,7 +407,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, const char *, void *
struct dentry * nfs_xdev_mount_common(struct file_system_type *, int,
const char *, struct nfs_mount_info *);
void nfs_kill_super(struct super_block *);
-void nfs_fill_super(struct super_block *, struct nfs_mount_info *);
+int nfs_fill_super(struct super_block *, struct nfs_mount_info *);
extern struct rpc_stat nfs_rpcstat;
@@ -458,7 +458,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
/* super.c */
-void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
+int nfs_clone_super(struct super_block *, struct nfs_mount_info *);
void nfs_umount_begin(struct super_block *);
int nfs_statfs(struct dentry *, struct kstatfs *);
int nfs_show_options(struct seq_file *, struct dentry *);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 54e0f9f2dd949..dc69314d455e7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2315,18 +2315,17 @@ inline void nfs_initialise_sb(struct super_block *sb)
sb->s_blocksize = nfs_block_bits(server->wsize,
&sb->s_blocksize_bits);
- sb->s_bdi = &server->backing_dev_info;
-
nfs_super_set_maxbytes(sb, server->maxfilesize);
}
/*
* Finish setting up an NFS2/3 superblock
*/
-void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
{
struct nfs_parsed_mount_data *data = mount_info->parsed;
struct nfs_server *server = NFS_SB(sb);
+ int ret;
sb->s_blocksize_bits = 0;
sb->s_blocksize = 0;
@@ -2344,13 +2343,21 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
}
nfs_initialise_sb(sb);
+
+ ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev),
+ MINOR(server->s_dev));
+ if (ret)
+ return ret;
+ sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+ return 0;
+
}
EXPORT_SYMBOL_GPL(nfs_fill_super);
/*
* Finish setting up a cloned NFS2/3/4 superblock
*/
-void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
{
const struct super_block *old_sb = mount_info->cloned->sb;
struct nfs_server *server = NFS_SB(sb);
@@ -2370,6 +2377,10 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
}
nfs_initialise_sb(sb);
+
+ sb->s_bdi = bdi_get(old_sb->s_bdi);
+
+ return 0;
}
static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
@@ -2522,11 +2533,6 @@ static void nfs_get_cache_cookie(struct super_block *sb,
}
#endif
-static int nfs_bdi_register(struct nfs_server *server)
-{
- return bdi_register_dev(&server->backing_dev_info, server->s_dev);
-}
-
int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot,
struct nfs_mount_info *mount_info)
{
@@ -2594,17 +2600,14 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
nfs_free_server(server);
server = NULL;
} else {
- error = nfs_bdi_register(server);
- if (error) {
- mntroot = ERR_PTR(error);
- goto error_splat_super;
- }
server->super = s;
}
if (!s->s_root) {
/* initial superblock/root creation */
- mount_info->fill_super(s, mount_info);
+ error = mount_info->fill_super(s, mount_info);
+ if (error)
+ goto error_splat_super;
nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index abb2c8a3be42e..cc341fc7fd442 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -263,16 +263,15 @@ int nfs_congestion_kb;
static void nfs_set_page_writeback(struct page *page)
{
- struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host);
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_server *nfss = NFS_SERVER(inode);
int ret = test_set_page_writeback(page);
WARN_ON_ONCE(ret != 0);
if (atomic_long_inc_return(&nfss->writeback) >
- NFS_CONGESTION_ON_THRESH) {
- set_bdi_congested(&nfss->backing_dev_info,
- BLK_RW_ASYNC);
- }
+ NFS_CONGESTION_ON_THRESH)
+ set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
static void nfs_end_page_writeback(struct nfs_page *req)
@@ -285,7 +284,7 @@ static void nfs_end_page_writeback(struct nfs_page *req)
end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+ clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
}
@@ -1808,7 +1807,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
}
nfss = NFS_SERVER(data->inode);
if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
- clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
+ clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC);
nfs_init_cinfo(&cinfo, data->inode, data->dreq);
nfs_commit_end(cinfo.mds);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 92b4b41d19d2a..fb5213afc854e 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
req->cmd[4] = bufflen & 0xff;
req->cmd_len = COMMAND_SIZE(INQUIRY);
- error = blk_execute_rq(rq->q, NULL, rq, 1);
- if (error) {
+ blk_execute_rq(rq->q, NULL, rq, 1);
+ if (req->result) {
pr_err("pNFS: INQUIRY 0x83 failed with: %x\n",
- rq->errors);
+ req->result);
+ error = -EIO;
goto out_put_request;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index e1872f36147f5..926682981d61f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
sb->s_max_links = NILFS_LINK_MAX;
- sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info;
+ sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi);
err = load_nilfs(nilfs, sb);
if (err)
diff --git a/fs/super.c b/fs/super.c
index b8b6a086c03b9..adb0c0de428c2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -446,6 +446,10 @@ void generic_shutdown_super(struct super_block *sb)
hlist_del_init(&sb->s_instances);
spin_unlock(&sb_lock);
up_write(&sb->s_umount);
+ if (sb->s_bdi != &noop_backing_dev_info) {
+ bdi_put(sb->s_bdi);
+ sb->s_bdi = &noop_backing_dev_info;
+ }
}
EXPORT_SYMBOL(generic_shutdown_super);
@@ -1049,12 +1053,8 @@ static int set_bdev_super(struct super_block *s, void *data)
{
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+ s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
- /*
- * We set the bdi here to the queue backing, file systems can
- * overwrite this in ->fill_super()
- */
- s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
}
@@ -1256,6 +1256,49 @@ out:
}
/*
+ * Setup private BDI for given superblock. It gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
+{
+ struct backing_dev_info *bdi;
+ int err;
+ va_list args;
+
+ bdi = bdi_alloc(GFP_KERNEL);
+ if (!bdi)
+ return -ENOMEM;
+
+ bdi->name = sb->s_type->name;
+
+ va_start(args, fmt);
+ err = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ if (err) {
+ bdi_put(bdi);
+ return err;
+ }
+ WARN_ON(sb->s_bdi != &noop_backing_dev_info);
+ sb->s_bdi = bdi;
+
+ return 0;
+}
+EXPORT_SYMBOL(super_setup_bdi_name);
+
+/*
+ * Setup private BDI for given superblock. It gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi(struct super_block *sb)
+{
+ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+ return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
+ atomic_long_inc_return(&bdi_seq));
+}
+EXPORT_SYMBOL(super_setup_bdi);
+
+/*
* This is an internal function, please use sb_end_{write,pagefault,intwrite}
* instead.
*/
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b73811bd7676d..cf4cc99b75b55 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb)
}
ubifs_umount(c);
- bdi_destroy(&c->bdi);
ubi_close_volume(c->ubi);
mutex_unlock(&c->umount_mutex);
}
@@ -2019,29 +2018,25 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
+ err = ubifs_parse_options(c, data, 0);
+ if (err)
+ goto out_close;
+
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
* UBIFS, I/O is not deferred, it is done immediately in readpage,
* which means the user would have to wait not just for their own I/O
* but the read-ahead I/O as well i.e. completely pointless.
*
- * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+ * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also
+ * @sb->s_bdi->capabilities are initialized to 0 so there won't be any
+ * writeback happening.
*/
- c->bdi.name = "ubifs",
- c->bdi.capabilities = 0;
- err = bdi_init(&c->bdi);
+ err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num,
+ c->vi.vol_id);
if (err)
goto out_close;
- err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d",
- c->vi.ubi_num, c->vi.vol_id);
- if (err)
- goto out_bdi;
-
- err = ubifs_parse_options(c, data, 0);
- if (err)
- goto out_bdi;
- sb->s_bdi = &c->bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
@@ -2080,8 +2075,6 @@ out_umount:
ubifs_umount(c);
out_unlock:
mutex_unlock(&c->umount_mutex);
-out_bdi:
- bdi_destroy(&c->bdi);
out_close:
ubi_close_volume(c->ubi);
out:
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 4d57e488038e3..4da10a6d702a0 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -972,7 +972,6 @@ struct ubifs_debug_info;
* struct ubifs_info - UBIFS file-system description data structure
* (per-superblock).
* @vfs_sb: VFS @struct super_block object
- * @bdi: backing device info object to make VFS happy and disable read-ahead
*
* @highest_inum: highest used inode number
* @max_sqnum: current global sequence number
@@ -1220,7 +1219,6 @@ struct ubifs_debug_info;
*/
struct ubifs_info {
struct super_block *vfs_sb;
- struct backing_dev_info bdi;
ino_t highest_inum;
unsigned long long max_sqnum;
@@ -1461,7 +1459,6 @@ extern const struct inode_operations ubifs_file_inode_operations;
extern const struct file_operations ubifs_dir_operations;
extern const struct inode_operations ubifs_dir_inode_operations;
extern const struct inode_operations ubifs_symlink_inode_operations;
-extern struct backing_dev_info ubifs_backing_dev_info;
extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
/* io.c */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 828532ce0adca..8795e9cd867cd 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -81,7 +81,7 @@ xfs_zero_extent(
return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
block << (mp->m_super->s_blocksize_bits - 9),
count_fsb << (mp->m_super->s_blocksize_bits - 9),
- GFP_NOFS, true);
+ GFP_NOFS, 0);
}
int
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ad955817916d0..866c433e7d322 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -21,6 +21,7 @@ struct dentry;
*/
enum wb_state {
WB_registered, /* bdi_register() was done */
+ WB_shutting_down, /* wb_shutdown() in progress */
WB_writeback_running, /* Writeback is in progress */
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
};
@@ -54,7 +55,9 @@ struct bdi_writeback_congested {
atomic_t refcnt; /* nr of attached wb's and blkg */
#ifdef CONFIG_CGROUP_WRITEBACK
- struct backing_dev_info *bdi; /* the associated bdi */
+ struct backing_dev_info *__bdi; /* the associated bdi, set to NULL
+ * on bdi unregistration. For memcg-wb
+ * internal use only! */
int blkcg_id; /* ID of the associated blkcg */
struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */
#endif
@@ -143,7 +146,7 @@ struct backing_dev_info {
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data; /* Pointer to aux data for congested func */
- char *name;
+ const char *name;
struct kref refcnt; /* Reference counter for the structure */
unsigned int capabilities; /* Device capabilities */
@@ -161,7 +164,6 @@ struct backing_dev_info {
#ifdef CONFIG_CGROUP_WRITEBACK
struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
struct rb_root cgwb_congested_tree; /* their congested states */
- atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
#else
struct bdi_writeback_congested *wb_congested;
#endif
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c52a48cb9a663..557d84063934c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -17,8 +17,6 @@
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>
-int __must_check bdi_init(struct backing_dev_info *bdi);
-
static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
kref_get(&bdi->refcnt);
@@ -27,16 +25,18 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
void bdi_put(struct backing_dev_info *bdi);
-__printf(3, 4)
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
+__printf(2, 3)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
+ va_list args);
int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);
-int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_destroy(struct backing_dev_info *bdi);
struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
+static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
+{
+ return bdi_alloc_node(gfp_mask, NUMA_NO_NODE);
+}
void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
bool range_cyclic, enum wb_reason reason);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 8e521194f6fc4..4931756d86d99 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int);
extern void bioset_free(struct bio_set *);
extern mempool_t *biovec_create_pool(int pool_entries);
-extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
+extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *);
extern void bio_put(struct bio *);
extern void __bio_clone_fast(struct bio *, struct bio *);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9382c5da7a2ed..f3e5e1de1bdb9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -15,7 +15,7 @@ struct blk_mq_hw_ctx {
unsigned long state; /* BLK_MQ_S_* flags */
} ____cacheline_aligned_in_smp;
- struct work_struct run_work;
+ struct delayed_work run_work;
cpumask_var_t cpumask;
int next_cpu;
int next_cpu_batch;
@@ -51,9 +51,6 @@ struct blk_mq_hw_ctx {
atomic_t nr_active;
- struct delayed_work delayed_run_work;
- struct delayed_work delay_work;
-
struct hlist_node cpuhp_dead;
struct kobject kobj;
@@ -82,7 +79,6 @@ struct blk_mq_tag_set {
struct blk_mq_queue_data {
struct request *rq;
- struct list_head *list;
bool last;
};
@@ -143,6 +139,14 @@ struct blk_mq_ops {
reinit_request_fn *reinit_request;
map_queues_fn *map_queues;
+
+#ifdef CONFIG_BLK_DEBUG_FS
+ /*
+ * Used by the debugfs implementation to show driver-specific
+ * information about a request.
+ */
+ void (*show_rq)(struct seq_file *m, struct request *rq);
+#endif
};
enum {
@@ -153,7 +157,6 @@ enum {
BLK_MQ_F_SHOULD_MERGE = 1 << 0,
BLK_MQ_F_TAG_SHARED = 1 << 1,
BLK_MQ_F_SG_MERGE = 1 << 2,
- BLK_MQ_F_DEFER_ISSUE = 1 << 4,
BLK_MQ_F_BLOCKING = 1 << 5,
BLK_MQ_F_NO_SCHED = 1 << 6,
BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -163,6 +166,7 @@ enum {
BLK_MQ_S_TAG_ACTIVE = 1,
BLK_MQ_S_SCHED_RESTART = 2,
BLK_MQ_S_TAG_WAITING = 3,
+ BLK_MQ_S_START_ON_RUN = 4,
BLK_MQ_MAX_DEPTH = 10240,
@@ -230,7 +234,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_abort_requeue_list(struct request_queue *q);
-void blk_mq_complete_request(struct request *rq, int error);
+void blk_mq_complete_request(struct request *rq);
bool blk_mq_queue_stopped(struct request_queue *q);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
@@ -240,13 +244,14 @@ void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv);
void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_unfreeze_queue(struct request_queue *q);
-void blk_mq_freeze_queue_start(struct request_queue *q);
+void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
unsigned long timeout);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index d703acb55d0f0..61339bc444006 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -17,6 +17,10 @@ struct io_context;
struct cgroup_subsys_state;
typedef void (bio_end_io_t) (struct bio *);
+struct blk_issue_stat {
+ u64 stat;
+};
+
/*
* main unit of I/O for the block layer and lower layers (ie drivers and
* stacking drivers)
@@ -29,7 +33,7 @@ struct bio {
* top bits REQ_OP. Use
* accessors.
*/
- unsigned short bi_flags; /* status, command, etc */
+ unsigned short bi_flags; /* status, etc and bvec pool number */
unsigned short bi_ioprio;
struct bvec_iter bi_iter;
@@ -58,6 +62,10 @@ struct bio {
*/
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+ void *bi_cg_private;
+ struct blk_issue_stat bi_issue_stat;
+#endif
#endif
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -102,12 +110,9 @@ struct bio {
#define BIO_REFFED 8 /* bio has elevated ->bi_cnt */
#define BIO_THROTTLED 9 /* This bio has already been subjected to
* throttling rules. Don't do it again. */
-
-/*
- * Flags starting here get preserved by bio_reset() - this includes
- * BVEC_POOL_IDX()
- */
-#define BIO_RESET_BITS 10
+#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion
+ * of this bio. */
+/* See BVEC_POOL_OFFSET below before adding new flags */
/*
* We support 6 different bvec pools, the last one is magic in that it
@@ -117,13 +122,22 @@ struct bio {
#define BVEC_POOL_MAX (BVEC_POOL_NR - 1)
/*
- * Top 4 bits of bio flags indicate the pool the bvecs came from. We add
+ * Top 3 bits of bio flags indicate the pool the bvecs came from. We add
* 1 to the actual index so that 0 indicates that there are no bvecs to be
* freed.
*/
-#define BVEC_POOL_BITS (4)
+#define BVEC_POOL_BITS (3)
#define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS)
#define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET)
+#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1)
+# error "BVEC_POOL_BITS is too small"
+#endif
+
+/*
+ * Flags starting here get preserved by bio_reset() - this includes
+ * only BVEC_POOL_IDX()
+ */
+#define BIO_RESET_BITS BVEC_POOL_OFFSET
/*
* Operations and flags common to the bio and request structures.
@@ -160,7 +174,7 @@ enum req_opf {
/* write the same sector many times */
REQ_OP_WRITE_SAME = 7,
/* write the zero filled sector many times */
- REQ_OP_WRITE_ZEROES = 8,
+ REQ_OP_WRITE_ZEROES = 9,
/* SCSI passthrough using struct scsi_request */
REQ_OP_SCSI_IN = 32,
@@ -187,6 +201,10 @@ enum req_flag_bits {
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_BACKGROUND, /* background IO */
+
+ /* command specific flags for REQ_OP_WRITE_ZEROES: */
+ __REQ_NOUNMAP, /* do not free blocks when zeroing */
+
__REQ_NR_BITS, /* stops here */
};
@@ -204,6 +222,8 @@ enum req_flag_bits {
#define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
#define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
+#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP)
+
#define REQ_FAILFAST_MASK \
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
@@ -283,12 +303,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie)
return (cookie & BLK_QC_T_INTERNAL) != 0;
}
-struct blk_issue_stat {
- u64 time;
-};
-
-#define BLK_RQ_STAT_BATCH 64
-
struct blk_rq_stat {
s64 mean;
u64 min;
@@ -296,7 +310,6 @@ struct blk_rq_stat {
s32 nr_samples;
s32 nr_batch;
u64 batch;
- s64 time;
};
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 01a696b0a4d3a..83d28623645f3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -40,15 +40,20 @@ struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
struct rq_wb;
+struct blk_queue_stats;
+struct blk_stat_callback;
#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
+/* Must be consistent with blk_mq_poll_stats_bkt() */
+#define BLK_MQ_POLL_STATS_BKTS 16
+
/*
* Maximum number of blkcg policies allowed to be registered concurrently.
* Defined here to simplify include dependency.
*/
-#define BLKCG_MAX_POLS 2
+#define BLKCG_MAX_POLS 3
typedef void (rq_end_io_fn)(struct request *, int);
@@ -173,6 +178,7 @@ struct request {
struct rb_node rb_node; /* sort/lookup */
struct bio_vec special_vec;
void *completion_data;
+ int error_count; /* for legacy drivers, don't use */
};
/*
@@ -213,16 +219,14 @@ struct request {
unsigned short ioprio;
- void *special; /* opaque pointer available for LLD use */
+ unsigned int timeout;
- int errors;
+ void *special; /* opaque pointer available for LLD use */
unsigned int extra_len; /* length of alignment and padding */
unsigned long deadline;
struct list_head timeout_list;
- unsigned int timeout;
- int retries;
/*
* completion callback.
@@ -337,7 +341,6 @@ struct queue_limits {
unsigned char misaligned;
unsigned char discard_misaligned;
unsigned char cluster;
- unsigned char discard_zeroes_data;
unsigned char raid_partial_stripes_expensive;
enum blk_zoned_model zoned;
};
@@ -388,6 +391,7 @@ struct request_queue {
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
+ struct blk_queue_stats *stats;
struct rq_wb *rq_wb;
/*
@@ -505,8 +509,6 @@ struct request_queue {
unsigned int nr_sorted;
unsigned int in_flight[2];
- struct blk_rq_stat rq_stats[2];
-
/*
* Number of active block driver functions for which blk_drain_queue()
* must wait. Must be incremented around functions that unlock the
@@ -516,6 +518,10 @@ struct request_queue {
unsigned int rq_timeout;
int poll_nsec;
+
+ struct blk_stat_callback *poll_cb;
+ struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS];
+
struct timer_list timeout;
struct work_struct timeout_work;
struct list_head timeout_list;
@@ -610,6 +616,8 @@ struct request_queue {
#define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */
#define QUEUE_FLAG_DAX 26 /* device supports DAX */
#define QUEUE_FLAG_STATS 27 /* track rq completion times */
+#define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */
+#define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */
#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_STACKABLE) | \
@@ -918,6 +926,7 @@ extern int blk_register_queue(struct gendisk *disk);
extern void blk_unregister_queue(struct gendisk *disk);
extern blk_qc_t generic_make_request(struct bio *bio);
extern void blk_rq_init(struct request_queue *q, struct request *rq);
+extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
extern void blk_put_request(struct request *);
extern void __blk_put_request(struct request_queue *, struct request *);
extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
@@ -963,7 +972,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns
extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
struct rq_map_data *, const struct iov_iter *,
gfp_t);
-extern int blk_execute_rq(struct request_queue *, struct gendisk *,
+extern void blk_execute_rq(struct request_queue *, struct gendisk *,
struct request *, int);
extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
struct request *, int, rq_end_io_fn *);
@@ -1081,20 +1090,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq)
}
/*
- * blk_rq_set_prio - associate a request with prio from ioc
- * @rq: request of interest
- * @ioc: target iocontext
- *
- * Assocate request prio with ioc prio so request based drivers
- * can leverage priority information.
- */
-static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc)
-{
- if (ioc)
- rq->ioprio = ioc->ioprio;
-}
-
-/*
* Request issue related functions.
*/
extern struct request *blk_peek_request(struct request_queue *q);
@@ -1120,13 +1115,10 @@ extern void blk_finish_request(struct request *rq, int error);
extern bool blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
extern void blk_end_request_all(struct request *rq, int error);
-extern bool blk_end_request_cur(struct request *rq, int error);
-extern bool blk_end_request_err(struct request *rq, int error);
extern bool __blk_end_request(struct request *rq, int error,
unsigned int nr_bytes);
extern void __blk_end_request_all(struct request *rq, int error);
extern bool __blk_end_request_cur(struct request *rq, int error);
-extern bool __blk_end_request_err(struct request *rq, int error);
extern void blk_complete_request(struct request *);
extern void __blk_complete_request(struct request *);
@@ -1329,23 +1321,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
return bqt->tag_index[tag];
}
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
+extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, struct page *page);
#define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */
-#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */
-extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, int flags,
struct bio **biop);
-extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, struct page *page);
+
+#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */
+#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */
+
extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop,
- bool discard);
+ unsigned flags);
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, bool discard);
+ sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
+
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
{
@@ -1359,7 +1355,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
return blkdev_issue_zeroout(sb->s_bdev,
block << (sb->s_blocksize_bits - 9),
nr_blocks << (sb->s_blocksize_bits - 9),
- gfp_mask, true);
+ gfp_mask, 0);
}
extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
@@ -1529,19 +1525,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev)
return q->limits.discard_alignment;
}
-static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
-{
- if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
- return 1;
-
- return 0;
-}
-
-static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
-{
- return queue_discard_zeroes_data(bdev_get_queue(bdev));
-}
-
static inline unsigned int bdev_write_same(struct block_device *bdev)
{
struct request_queue *q = bdev_get_queue(bdev);
@@ -1726,6 +1709,7 @@ int kblockd_schedule_work(struct work_struct *work);
int kblockd_schedule_work_on(int cpu, struct work_struct *work);
int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
+int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
#ifdef CONFIG_BLK_CGROUP
/*
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 5b8721efa948e..31e4e1f1547cc 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -15,7 +15,6 @@ struct venus_comm {
struct list_head vc_processing;
int vc_inuse;
struct super_block *vc_sb;
- struct backing_dev_info bdi;
struct mutex vc_mutex;
};
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index a7e6903866fdc..c7ea33e38fb9e 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -255,6 +255,12 @@ struct dm_target {
unsigned num_write_same_bios;
/*
+ * The number of WRITE ZEROES bios that will be submitted to the target.
+ * The bio number can be accessed with dm_bio_get_target_bio_nr.
+ */
+ unsigned num_write_zeroes_bios;
+
+ /*
* The minimum number of extra bytes allocated in each io for the
* target to use.
*/
@@ -290,11 +296,6 @@ struct dm_target {
* on max_io_len boundary.
*/
bool split_discard_bios:1;
-
- /*
- * Set if this target does not return zeroes on discarded blocks.
- */
- bool discard_zeroes_data_unsupported:1;
};
/* Each target can link one of these into the table */
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 22d39e8d4de16..3a216318ae739 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -93,6 +93,8 @@ struct blk_mq_hw_ctx;
struct elevator_mq_ops {
int (*init_sched)(struct request_queue *, struct elevator_type *);
void (*exit_sched)(struct elevator_queue *);
+ int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
+ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
@@ -104,7 +106,7 @@ struct elevator_mq_ops {
void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
bool (*has_work)(struct blk_mq_hw_ctx *);
- void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+ void (*completed_request)(struct request *);
void (*started_request)(struct request *);
void (*requeue_request)(struct request *);
struct request *(*former_request)(struct request_queue *, struct request *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7251f7bb45e8b..30e5c14bd7433 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2121,6 +2121,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
extern int freeze_super(struct super_block *super);
extern int thaw_super(struct super_block *super);
extern bool our_mnt(struct vfsmount *mnt);
+extern __printf(2, 3)
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
+extern int super_setup_bdi(struct super_block *sb);
extern int current_umask(void);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 76f39754e7b02..acff9437e5c37 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -159,11 +159,11 @@ struct badblocks;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct blk_integrity {
- struct blk_integrity_profile *profile;
- unsigned char flags;
- unsigned char tuple_size;
- unsigned char interval_exp;
- unsigned char tag_size;
+ const struct blk_integrity_profile *profile;
+ unsigned char flags;
+ unsigned char tuple_size;
+ unsigned char interval_exp;
+ unsigned char tag_size;
};
#endif /* CONFIG_BLK_DEV_INTEGRITY */
@@ -722,11 +722,9 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
#if defined(CONFIG_BLK_DEV_INTEGRITY)
extern void blk_integrity_add(struct gendisk *);
extern void blk_integrity_del(struct gendisk *);
-extern void blk_integrity_revalidate(struct gendisk *);
#else /* CONFIG_BLK_DEV_INTEGRITY */
static inline void blk_integrity_add(struct gendisk *disk) { }
static inline void blk_integrity_del(struct gendisk *disk) { }
-static inline void blk_integrity_revalidate(struct gendisk *disk) { }
#endif /* CONFIG_BLK_DEV_INTEGRITY */
#else /* CONFIG_BLOCK */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 2f51c1724b5af..6980ca322074b 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -88,7 +88,7 @@ static inline bool ata_pm_request(struct request *rq)
ide_req(rq)->type == ATA_PRIV_PM_RESUME);
}
-/* Error codes returned in rq->errors to the higher part of the driver. */
+/* Error codes returned in result to the higher part of the driver. */
enum {
IDE_DRV_ERROR_GENERAL = 101,
IDE_DRV_ERROR_FILEMARK = 102,
diff --git a/include/linux/inet.h b/include/linux/inet.h
index 4cca05c9678e7..636ebe87e6f88 100644
--- a/include/linux/inet.h
+++ b/include/linux/inet.h
@@ -43,6 +43,8 @@
#define _LINUX_INET_H
#include <linux/types.h>
+#include <net/net_namespace.h>
+#include <linux/socket.h>
/*
* These mimic similar macros defined in user-space for inet_ntop(3).
@@ -54,4 +56,8 @@
extern __be32 in_aton(const char *str);
extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end);
+
+extern int inet_pton_with_scope(struct net *net, unsigned short af,
+ const char *src, const char *port, struct sockaddr_storage *addr);
+
#endif /* _LINUX_INET_H */
diff --git a/include/linux/kobject.h b/include/linux/kobject.h
index e6284591599ec..ca85cb80e99a6 100644
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name);
extern int __must_check kobject_move(struct kobject *, struct kobject *);
extern struct kobject *kobject_get(struct kobject *kobj);
+extern struct kobject * __must_check kobject_get_unless_zero(
+ struct kobject *kobj);
extern void kobject_put(struct kobject *kobj);
extern const void *kobject_namespace(struct kobject *kobj);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index ca45e4a088a91..7dfa56ebbc6d0 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -56,7 +56,6 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32,
typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *);
typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int);
typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *);
typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
typedef void (nvm_destroy_dma_pool_fn)(void *);
typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
@@ -70,7 +69,6 @@ struct nvm_dev_ops {
nvm_op_set_bb_fn *set_bb_tbl;
nvm_submit_io_fn *submit_io;
- nvm_erase_blk_fn *erase_block;
nvm_create_dma_pool_fn *create_dma_pool;
nvm_destroy_dma_pool_fn *destroy_dma_pool;
@@ -125,7 +123,7 @@ enum {
/* NAND Access Modes */
NVM_IO_SUSPEND = 0x80,
NVM_IO_SLC_MODE = 0x100,
- NVM_IO_SCRAMBLE_DISABLE = 0x200,
+ NVM_IO_SCRAMBLE_ENABLE = 0x200,
/* Block Types */
NVM_BLK_T_FREE = 0x0,
@@ -438,7 +436,8 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2)
typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *);
typedef sector_t (nvm_tgt_capacity_fn)(void *);
-typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *);
+typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *,
+ int flags);
typedef void (nvm_tgt_exit_fn)(void *);
typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *);
typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *);
@@ -479,10 +478,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *,
int, int);
extern int nvm_max_phys_sects(struct nvm_tgt_dev *);
extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *);
-extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *,
+extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *,
const struct ppa_addr *, int, int);
-extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *);
-extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int);
+extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *);
extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *,
void *);
extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t);
diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h
deleted file mode 100644
index e11f4d9f1c2e0..0000000000000
--- a/include/linux/mg_disk.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * include/linux/mg_disk.c
- *
- * Private data for mflash platform driver
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __MG_DISK_H__
-#define __MG_DISK_H__
-
-/* name for platform device */
-#define MG_DEV_NAME "mg_disk"
-
-/* names of GPIO resource */
-#define MG_RST_PIN "mg_rst"
-/* except MG_BOOT_DEV, reset-out pin should be assigned */
-#define MG_RSTOUT_PIN "mg_rstout"
-
-/* device attribution */
-/* use mflash as boot device */
-#define MG_BOOT_DEV (1 << 0)
-/* use mflash as storage device */
-#define MG_STORAGE_DEV (1 << 1)
-/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
-#define MG_STORAGE_DEV_SKIP_RST (1 << 2)
-
-/* private driver data */
-struct mg_drv_data {
- /* disk resource */
- u32 use_polling;
-
- /* device attribution */
- u32 dev_attr;
-
- /* internally used */
- void *host;
-};
-
-#endif
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index eebdc63cf6af9..79b176eca04a1 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -334,11 +334,6 @@ struct mtd_info {
int (*_get_device) (struct mtd_info *mtd);
void (*_put_device) (struct mtd_info *mtd);
- /* Backing device capabilities for this device
- * - provides mmap capabilities
- */
- struct backing_dev_info *backing_dev_info;
-
struct notifier_block reboot_notifier; /* default mode before reboot */
/* ECC status information */
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b34097c678486..e1502c55741ef 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -133,7 +133,6 @@ struct nfs_server {
struct rpc_clnt * client_acl; /* ACL RPC client handle */
struct nlm_host *nlm_host; /* NLM client handle */
struct nfs_iostats __percpu *io_stats; /* I/O statistics */
- struct backing_dev_info backing_dev_info;
atomic_long_t writeback; /* number of writeback pages */
int flags; /* various flags */
unsigned int caps; /* server capabilities */
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index f21471f7ee407..0db37158a61d4 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir {
* transferred. Should equal payload_length on success.
* @rcv_rsplen: length, in bytes, of the FCP RSP IU received.
* @status: Completion status of the FCP operation. must be 0 upon success,
- * NVME_SC_FC_xxx value upon failure. Note: this is NOT a
- * reflection of the NVME CQE completion status. Only the status
- * of the FCP operation at the NVME-FC level.
+ * negative errno value upon failure (ex: -EIO). Note: this is
+ * NOT a reflection of the NVME CQE completion status. Only the
+ * status of the FCP operation at the NVME-FC level.
*/
struct nvmefc_fcp_req {
void *cmdaddr;
@@ -533,9 +533,6 @@ enum {
* rsp as well
*/
NVMET_FCOP_RSP = 4, /* send rsp frame */
- NVMET_FCOP_ABORT = 5, /* abort exchange via ABTS */
- NVMET_FCOP_BA_ACC = 6, /* send BA_ACC */
- NVMET_FCOP_BA_RJT = 7, /* send BA_RJT */
};
/**
@@ -572,8 +569,6 @@ enum {
* upon compeletion of the operation. The nvmet-fc layer will also set a
* private pointer for its own use in the done routine.
*
- * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !!
- *
* Values set by the NVMET-FC layer prior to calling the LLDD fcp_op
* entrypoint.
* @op: Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx)
@@ -655,6 +650,22 @@ enum {
* on. The transport should pick a cpu to schedule the work
* on.
*/
+ NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 2),
+ /* Bit 2: When 0, the LLDD is calling the cmd rcv handler
+ * in a non-isr context, allowing the transport to finish
+ * op completion in the calling context. When 1, the LLDD
+ * is calling the cmd rcv handler in an ISR context,
+ * requiring the transport to transition to a workqueue
+ * for op completion.
+ */
+ NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 3),
+ /* Bit 3: When 0, the LLDD is calling the op done handler
+ * in a non-isr context, allowing the transport to finish
+ * op completion in the calling context. When 1, the LLDD
+ * is calling the op done handler in an ISR context,
+ * requiring the transport to transition to a workqueue
+ * for op completion.
+ */
};
@@ -725,12 +736,12 @@ struct nvmet_fc_target_port {
* be freed/released.
* Entrypoint is Mandatory.
*
- * @fcp_op: Called to perform a data transfer, transmit a response, or
- * abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same
- * LLDD-supplied exchange structure specified in the
- * nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received.
- * The op field in the structure shall indicate the operation for
- * the LLDD to perform relative to the io.
+ * @fcp_op: Called to perform a data transfer or transmit a response.
+ * The nvmefc_tgt_fcp_req structure is the same LLDD-supplied
+ * exchange structure specified in the nvmet_fc_rcv_fcp_req() call
+ * made when the FCP CMD IU was received. The op field in the
+ * structure shall indicate the operation for the LLDD to perform
+ * relative to the io.
* NVMET_FCOP_READDATA operation: the LLDD is to send the
* payload data (described by sglist) to the host in 1 or
* more FC sequences (preferrably 1). Note: the fc-nvme layer
@@ -752,29 +763,31 @@ struct nvmet_fc_target_port {
* successfully, the LLDD is to update the nvmefc_tgt_fcp_req
* transferred_length field and may subsequently transmit the
* FCP_RSP iu payload (described by rspbuf, rspdma, rsplen).
- * The LLDD is to await FCP_CONF reception to confirm the RSP
- * reception by the host. The LLDD may retramsit the FCP_RSP iu
- * if necessary per FC-NVME. Upon reception of FCP_CONF, or upon
- * FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req
- * fcp_error field and consider the operation complete..
+ * If FCP_CONF is supported, the LLDD is to await FCP_CONF
+ * reception to confirm the RSP reception by the host. The LLDD
+ * may retransmit the FCP_RSP iu if necessary per FC-NVME. Upon
+ * transmission of the FCP_RSP iu if FCP_CONF is not supported,
+ * or upon success/failure of FCP_CONF if it is supported, the
+ * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
+ * consider the operation complete.
* NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload
- * (described by rspbuf, rspdma, rsplen). The LLDD is to await
- * FCP_CONF reception to confirm the RSP reception by the host.
- * The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME.
- * Upon reception of FCP_CONF, or upon FCP_CONF failure, the
+ * (described by rspbuf, rspdma, rsplen). If FCP_CONF is
+ * supported, the LLDD is to await FCP_CONF reception to confirm
+ * the RSP reception by the host. The LLDD may retransmit the
+ * FCP_RSP iu if FCP_CONF is not received per FC-NVME. Upon
+ * transmission of the FCP_RSP iu if FCP_CONF is not supported,
+ * or upon success/failure of FCP_CONF if it is supported, the
* LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and
- * consider the operation complete..
- * NVMET_FCOP_ABORT: the LLDD is to terminate the exchange
- * corresponding to the fcp operation. The LLDD shall send
- * ABTS and follow FC exchange abort-multi rules, including
- * ABTS retries and possible logout.
+ * consider the operation complete.
* Upon completing the indicated operation, the LLDD is to set the
* status fields for the operation (tranferred_length and fcp_error
- * status) in the request, then all the "done" routine
- * indicated in the fcp request. Upon return from the "done"
- * routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation
- * the fc-nvme layer will not longer reference the fcp request,
- * allowing the LLDD to free/release the fcp request.
+ * status) in the request, then call the "done" routine
+ * indicated in the fcp request. After the operation completes,
+ * regardless of whether the FCP_RSP iu was successfully transmitted,
+ * the LLDD-supplied exchange structure must remain valid until the
+ * transport calls the fcp_req_release() callback to return ownership
+ * of the exchange structure back to the LLDD so that it may be used
+ * for another fcp command.
* Note: when calling the done routine for READDATA or WRITEDATA
* operations, the fc-nvme layer may immediate convert, in the same
* thread and before returning to the LLDD, the fcp operation to
@@ -786,6 +799,22 @@ struct nvmet_fc_target_port {
* Returns 0 on success, -<errno> on failure (Ex: -EIO)
* Entrypoint is Mandatory.
*
+ * @fcp_abort: Called by the transport to abort an active command.
+ * The command may be in-between operations (nothing active in LLDD)
+ * or may have an active WRITEDATA operation pending. The LLDD is to
+ * initiate the ABTS process for the command and return from the
+ * callback. The ABTS does not need to be complete on the command.
+ * The fcp_abort callback inherently cannot fail. After the
+ * fcp_abort() callback completes, the transport will wait for any
+ * outstanding operation (if there was one) to complete, then will
+ * call the fcp_req_release() callback to return the command's
+ * exchange context back to the LLDD.
+ *
+ * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req
+ * to the LLDD after all operations on the fcp operation are complete.
+ * This may be due to the command completing or upon completion of
+ * abort cleanup.
+ *
* @max_hw_queues: indicates the maximum number of hw queues the LLDD
* supports for cpu affinitization.
* Value is Mandatory. Must be at least 1.
@@ -820,7 +849,11 @@ struct nvmet_fc_target_template {
int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport,
struct nvmefc_tgt_ls_req *tls_req);
int (*fcp_op)(struct nvmet_fc_target_port *tgtport,
- struct nvmefc_tgt_fcp_req *);
+ struct nvmefc_tgt_fcp_req *fcpreq);
+ void (*fcp_abort)(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *fcpreq);
+ void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *fcpreq);
u32 max_hw_queues;
u16 max_sgl_segments;
@@ -848,4 +881,7 @@ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport,
struct nvmefc_tgt_fcp_req *fcpreq,
void *cmdiubuf, u32 cmdiubuf_len);
+void nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *tgtport,
+ struct nvmefc_tgt_fcp_req *fcpreq);
+
#endif /* _NVME_FC_DRIVER_H */
diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h
index 4b45226bd604c..e997c4a49a888 100644
--- a/include/linux/nvme-fc.h
+++ b/include/linux/nvme-fc.h
@@ -16,8 +16,7 @@
*/
/*
- * This file contains definitions relative to FC-NVME r1.11 and a few
- * newer items
+ * This file contains definitions relative to FC-NVME r1.14 (16-020vB).
*/
#ifndef _NVME_FC_H
@@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu {
#define NVME_FC_SIZEOF_ZEROS_RSP 12
+enum {
+ FCNVME_SC_SUCCESS = 0,
+ FCNVME_SC_INVALID_FIELD = 1,
+ FCNVME_SC_INVALID_CONNID = 2,
+};
+
struct nvme_fc_ersp_iu {
- __u8 rsvd0[2];
+ __u8 status_code;
+ __u8 rsvd1;
__be16 iu_len;
__be32 rsn;
__be32 xfrd_len;
@@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu {
};
-/* FC-NVME r1.03/16-119v0 NVME Link Services */
+/* FC-NVME Link Services */
enum {
FCNVME_LS_RSVD = 0,
FCNVME_LS_RJT = 1,
@@ -68,7 +74,7 @@ enum {
FCNVME_LS_DISCONNECT = 5,
};
-/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */
+/* FC-NVME Link Service Descriptors */
enum {
FCNVME_LSDESC_RSVD = 0x0,
FCNVME_LSDESC_RQST = 0x1,
@@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz)
return cpu_to_be32(sz - (2 * sizeof(u32)));
}
-
struct fcnvme_ls_rqst_w0 {
u8 ls_cmd; /* FCNVME_LS_xxx */
u8 zeros[3];
@@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst {
__be32 rsvd12;
};
+/* FC-NVME LS RJT reason_code values */
+enum fcnvme_ls_rjt_reason {
+ FCNVME_RJT_RC_NONE = 0,
+ /* no reason - not to be sent */
+
+ FCNVME_RJT_RC_INVAL = 0x01,
+ /* invalid NVMe_LS command code */
+
+ FCNVME_RJT_RC_LOGIC = 0x03,
+ /* logical error */
+
+ FCNVME_RJT_RC_UNAB = 0x09,
+ /* unable to perform command request */
+
+ FCNVME_RJT_RC_UNSUP = 0x0b,
+ /* command not supported */
+
+ FCNVME_RJT_RC_INPROG = 0x0e,
+ /* command already in progress */
+ FCNVME_RJT_RC_INV_ASSOC = 0x40,
+ /* Invalid Association ID */
+ FCNVME_RJT_RC_INV_CONN = 0x41,
+ /* Invalid Connection ID */
+
+ FCNVME_RJT_RC_VENDOR = 0xff,
+ /* vendor specific error */
+};
+
+/* FC-NVME LS RJT reason_explanation values */
+enum fcnvme_ls_rjt_explan {
+ FCNVME_RJT_EXP_NONE = 0x00,
+ /* No additional explanation */
+
+ FCNVME_RJT_EXP_OXID_RXID = 0x17,
+ /* invalid OX_ID-RX_ID combination */
+
+ FCNVME_RJT_EXP_INSUF_RES = 0x29,
+ /* insufficient resources */
+
+ FCNVME_RJT_EXP_UNAB_DATA = 0x2a,
+ /* unable to supply requested data */
+
+ FCNVME_RJT_EXP_INV_LEN = 0x2d,
+ /* Invalid payload length */
+};
/* FCNVME_LSDESC_RJT */
struct fcnvme_lsdesc_rjt {
@@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt {
* Reject reason and explanaction codes are generic
* to ELs's from LS-3.
*/
- u8 reason_code;
- u8 reason_explanation;
+ u8 reason_code; /* fcnvme_ls_rjt_reason */
+ u8 reason_explanation; /* fcnvme_ls_rjt_explan */
u8 vendor;
__be32 rsvd12;
};
-#define FCNVME_ASSOC_HOSTID_LEN 64
+#define FCNVME_ASSOC_HOSTID_LEN 16
#define FCNVME_ASSOC_HOSTNQN_LEN 256
#define FCNVME_ASSOC_SUBNQN_LEN 256
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 9061780b141ff..b625bacf37efa 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -245,6 +245,7 @@ enum {
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
+ NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7,
};
struct nvme_lbaf {
@@ -603,6 +604,7 @@ enum nvme_admin_opcode {
nvme_admin_download_fw = 0x11,
nvme_admin_ns_attach = 0x15,
nvme_admin_keep_alive = 0x18,
+ nvme_admin_dbbuf = 0x7C,
nvme_admin_format_nvm = 0x80,
nvme_admin_security_send = 0x81,
nvme_admin_security_recv = 0x82,
@@ -874,6 +876,16 @@ struct nvmf_property_get_command {
__u8 resv4[16];
};
+struct nvme_dbbuf {
+ __u8 opcode;
+ __u8 flags;
+ __u16 command_id;
+ __u32 rsvd1[5];
+ __le64 prp1;
+ __le64 prp2;
+ __u32 rsvd12[6];
+};
+
struct nvme_command {
union {
struct nvme_common_command common;
@@ -893,6 +905,7 @@ struct nvme_command {
struct nvmf_connect_command connect;
struct nvmf_property_set_command prop_set;
struct nvmf_property_get_command prop_get;
+ struct nvme_dbbuf dbbuf;
};
};
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index d4e0a204c118c..a1904aadbc450 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -176,6 +176,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth);
int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin);
/**
+ * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
+ * limiting the depth used from each word.
+ * @sb: Bitmap to allocate from.
+ * @alloc_hint: Hint for where to start searching for a free bit.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ *
+ * This rather specific operation allows for having multiple users with
+ * different allocation limits. E.g., there can be a high-priority class that
+ * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
+ * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
+ * class can only allocate half of the total bits in the bitmap, preventing it
+ * from starving out the high-priority class.
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+ unsigned long shallow_depth);
+
+/**
* sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
* @sb: Bitmap to check.
*
@@ -326,6 +345,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);
int __sbitmap_queue_get(struct sbitmap_queue *sbq);
/**
+ * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word, with preemption
+ * already disabled.
+ * @sbq: Bitmap queue to allocate from.
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+ unsigned int shallow_depth);
+
+/**
* sbitmap_queue_get() - Try to allocate a free bit from a &struct
* sbitmap_queue.
* @sbq: Bitmap queue to allocate from.
@@ -346,6 +378,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
}
/**
+ * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
+ * sbitmap_queue, limiting the depth used from each word.
+ * @sbq: Bitmap queue to allocate from.
+ * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
+ * sbitmap_queue_clear()).
+ * @shallow_depth: The maximum number of bits to allocate from a single word.
+ * See sbitmap_get_shallow().
+ *
+ * Return: Non-negative allocated bit number if successful, -1 otherwise.
+ */
+static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+ unsigned int *cpu,
+ unsigned int shallow_depth)
+{
+ int nr;
+
+ *cpu = get_cpu();
+ nr = __sbitmap_queue_get_shallow(sbq, shallow_depth);
+ put_cpu();
+ return nr;
+}
+
+/**
* sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
* &struct sbitmap_queue.
* @sbq: Bitmap to free from.
diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h
index 9fba9dd335440..9375d23a24e7a 100644
--- a/include/linux/t10-pi.h
+++ b/include/linux/t10-pi.h
@@ -34,9 +34,9 @@ struct t10_pi_tuple {
};
-extern struct blk_integrity_profile t10_pi_type1_crc;
-extern struct blk_integrity_profile t10_pi_type1_ip;
-extern struct blk_integrity_profile t10_pi_type3_crc;
-extern struct blk_integrity_profile t10_pi_type3_ip;
+extern const struct blk_integrity_profile t10_pi_type1_crc;
+extern const struct blk_integrity_profile t10_pi_type1_ip;
+extern const struct blk_integrity_profile t10_pi_type3_crc;
+extern const struct blk_integrity_profile t10_pi_type3_ip;
#endif
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index a3c0cbd7c8882..d5815794416c9 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page)
static inline void inode_detach_wb(struct inode *inode)
{
if (inode->i_wb) {
+ WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
wb_put(inode->i_wb);
inode->i_wb = NULL;
}
diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h
index ba0aeb980f7e7..f0c76f9dc2854 100644
--- a/include/scsi/scsi_request.h
+++ b/include/scsi/scsi_request.h
@@ -9,8 +9,10 @@ struct scsi_request {
unsigned char __cmd[BLK_MAX_CDB];
unsigned char *cmd;
unsigned short cmd_len;
+ int result;
unsigned int sense_len;
unsigned int resid_len; /* residual count */
+ int retries;
void *sense;
};
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index a88ed13446ff8..d0dbe60d8a6dd 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -61,7 +61,16 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer,
TP_ARGS(bh)
);
-DECLARE_EVENT_CLASS(block_rq_with_error,
+/**
+ * block_rq_requeue - place block IO request back on a queue
+ * @q: queue holding operation
+ * @rq: block IO operation request
+ *
+ * The block operation request @rq is being placed back into queue
+ * @q. For some reason the request was not completed and needs to be
+ * put back in the queue.
+ */
+TRACE_EVENT(block_rq_requeue,
TP_PROTO(struct request_queue *q, struct request *rq),
@@ -71,7 +80,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
__field( dev_t, dev )
__field( sector_t, sector )
__field( unsigned int, nr_sector )
- __field( int, errors )
__array( char, rwbs, RWBS_LEN )
__dynamic_array( char, cmd, 1 )
),
@@ -80,7 +88,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
__entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
__entry->sector = blk_rq_trace_sector(rq);
__entry->nr_sector = blk_rq_trace_nr_sectors(rq);
- __entry->errors = rq->errors;
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
__get_str(cmd)[0] = '\0';
@@ -90,46 +97,13 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rwbs, __get_str(cmd),
(unsigned long long)__entry->sector,
- __entry->nr_sector, __entry->errors)
-);
-
-/**
- * block_rq_abort - abort block operation request
- * @q: queue containing the block operation request
- * @rq: block IO operation request
- *
- * Called immediately after pending block IO operation request @rq in
- * queue @q is aborted. The fields in the operation request @rq
- * can be examined to determine which device and sectors the pending
- * operation would access.
- */
-DEFINE_EVENT(block_rq_with_error, block_rq_abort,
-
- TP_PROTO(struct request_queue *q, struct request *rq),
-
- TP_ARGS(q, rq)
-);
-
-/**
- * block_rq_requeue - place block IO request back on a queue
- * @q: queue holding operation
- * @rq: block IO operation request
- *
- * The block operation request @rq is being placed back into queue
- * @q. For some reason the request was not completed and needs to be
- * put back in the queue.
- */
-DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
-
- TP_PROTO(struct request_queue *q, struct request *rq),
-
- TP_ARGS(q, rq)
+ __entry->nr_sector, 0)
);
/**
* block_rq_complete - block IO operation completed by device driver
- * @q: queue containing the block operation request
* @rq: block operations request
+ * @error: status code
* @nr_bytes: number of completed bytes
*
* The block_rq_complete tracepoint event indicates that some portion
@@ -140,16 +114,15 @@ DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
*/
TRACE_EVENT(block_rq_complete,
- TP_PROTO(struct request_queue *q, struct request *rq,
- unsigned int nr_bytes),
+ TP_PROTO(struct request *rq, int error, unsigned int nr_bytes),
- TP_ARGS(q, rq, nr_bytes),
+ TP_ARGS(rq, error, nr_bytes),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( sector_t, sector )
__field( unsigned int, nr_sector )
- __field( int, errors )
+ __field( int, error )
__array( char, rwbs, RWBS_LEN )
__dynamic_array( char, cmd, 1 )
),
@@ -158,7 +131,7 @@ TRACE_EVENT(block_rq_complete,
__entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
__entry->sector = blk_rq_pos(rq);
__entry->nr_sector = nr_bytes >> 9;
- __entry->errors = rq->errors;
+ __entry->error = error;
blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes);
__get_str(cmd)[0] = '\0';
@@ -168,7 +141,7 @@ TRACE_EVENT(block_rq_complete,
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->rwbs, __get_str(cmd),
(unsigned long long)__entry->sector,
- __entry->nr_sector, __entry->errors)
+ __entry->nr_sector, __entry->error)
);
DECLARE_EVENT_CLASS(block_rq,
diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h
index fd19f36b31292..c8aec4b9e73b8 100644
--- a/include/uapi/linux/lightnvm.h
+++ b/include/uapi/linux/lightnvm.h
@@ -85,6 +85,10 @@ struct nvm_ioctl_create_conf {
};
};
+enum {
+ NVM_TARGET_FACTORY = 1 << 0, /* Init target in factory mode */
+};
+
struct nvm_ioctl_create {
char dev[DISK_NAME_LEN]; /* open-channel SSD device */
char tgttype[NVM_TTYPE_NAME_MAX]; /* target type name */
diff --git a/include/uapi/linux/nbd-netlink.h b/include/uapi/linux/nbd-netlink.h
new file mode 100644
index 0000000000000..6f7ca3d63a653
--- /dev/null
+++ b/include/uapi/linux/nbd-netlink.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2017 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef _UAPILINUX_NBD_NETLINK_H
+#define _UAPILINUX_NBD_NETLINK_H
+
+#define NBD_GENL_FAMILY_NAME "nbd"
+#define NBD_GENL_VERSION 0x1
+#define NBD_GENL_MCAST_GROUP_NAME "nbd_mc_group"
+
+/* Configuration policy attributes, used for CONNECT */
+enum {
+ NBD_ATTR_UNSPEC,
+ NBD_ATTR_INDEX,
+ NBD_ATTR_SIZE_BYTES,
+ NBD_ATTR_BLOCK_SIZE_BYTES,
+ NBD_ATTR_TIMEOUT,
+ NBD_ATTR_SERVER_FLAGS,
+ NBD_ATTR_CLIENT_FLAGS,
+ NBD_ATTR_SOCKETS,
+ NBD_ATTR_DEAD_CONN_TIMEOUT,
+ NBD_ATTR_DEVICE_LIST,
+ __NBD_ATTR_MAX,
+};
+#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1)
+
+/*
+ * This is the format for multiple devices with NBD_ATTR_DEVICE_LIST
+ *
+ * [NBD_ATTR_DEVICE_LIST]
+ * [NBD_DEVICE_ITEM]
+ * [NBD_DEVICE_INDEX]
+ * [NBD_DEVICE_CONNECTED]
+ */
+enum {
+ NBD_DEVICE_ITEM_UNSPEC,
+ NBD_DEVICE_ITEM,
+ __NBD_DEVICE_ITEM_MAX,
+};
+#define NBD_DEVICE_ITEM_MAX (__NBD_DEVICE_ITEM_MAX - 1)
+
+enum {
+ NBD_DEVICE_UNSPEC,
+ NBD_DEVICE_INDEX,
+ NBD_DEVICE_CONNECTED,
+ __NBD_DEVICE_MAX,
+};
+#define NBD_DEVICE_ATTR_MAX (__NBD_DEVICE_MAX - 1)
+
+/*
+ * This is the format for multiple sockets with NBD_ATTR_SOCKETS
+ *
+ * [NBD_ATTR_SOCKETS]
+ * [NBD_SOCK_ITEM]
+ * [NBD_SOCK_FD]
+ * [NBD_SOCK_ITEM]
+ * [NBD_SOCK_FD]
+ */
+enum {
+ NBD_SOCK_ITEM_UNSPEC,
+ NBD_SOCK_ITEM,
+ __NBD_SOCK_ITEM_MAX,
+};
+#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1)
+
+enum {
+ NBD_SOCK_UNSPEC,
+ NBD_SOCK_FD,
+ __NBD_SOCK_MAX,
+};
+#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1)
+
+enum {
+ NBD_CMD_UNSPEC,
+ NBD_CMD_CONNECT,
+ NBD_CMD_DISCONNECT,
+ NBD_CMD_RECONFIGURE,
+ NBD_CMD_LINK_DEAD,
+ NBD_CMD_STATUS,
+ __NBD_CMD_MAX,
+};
+#define NBD_CMD_MAX (__NBD_CMD_MAX - 1)
+
+#endif /* _UAPILINUX_NBD_NETLINK_H */
diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h
index c91c642ea9003..155e33f819134 100644
--- a/include/uapi/linux/nbd.h
+++ b/include/uapi/linux/nbd.h
@@ -37,7 +37,7 @@ enum {
NBD_CMD_TRIM = 4
};
-/* values for flags field */
+/* values for flags field, these are server interaction specific. */
#define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */
#define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */
#define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */
@@ -45,6 +45,10 @@ enum {
#define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */
#define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */
+/* These are client behavior specific flags. */
+#define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on
+ disconnect. */
+
/* userspace doesn't need the nbd_device structure */
/* These are sent over the network in the request/reply magic fields */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b2058a7f94bd8..bd8ae8d5ae9ca 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q)
/**
* blk_add_trace_rq - Add a trace for a request oriented action
- * @q: queue the io is for
* @rq: the source request
+ * @error: return status to log
* @nr_bytes: number of completed bytes
* @what: the action
*
@@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q)
* Records an action against a request. Will log the bio offset + size.
*
**/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt = rq->q->blk_trace;
if (likely(!bt))
return;
@@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
what |= BLK_TC_ACT(BLK_TC_FS);
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
- rq->cmd_flags, what, rq->errors, 0, NULL);
-}
-
-static void blk_add_trace_rq_abort(void *ignore,
- struct request_queue *q, struct request *rq)
-{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT);
+ rq->cmd_flags, what, error, 0, NULL);
}
static void blk_add_trace_rq_insert(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT);
}
static void blk_add_trace_rq_issue(void *ignore,
struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE);
}
static void blk_add_trace_rq_requeue(void *ignore,
struct request_queue *q,
struct request *rq)
{
- blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE);
+ blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE);
}
-static void blk_add_trace_rq_complete(void *ignore,
- struct request_queue *q,
- struct request *rq,
- unsigned int nr_bytes)
+static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
+ int error, unsigned int nr_bytes)
{
- blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE);
+ blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE);
}
/**
@@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
- rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors,
+ rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r);
}
@@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q,
return;
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
- BLK_TA_DRV_DATA, rq->errors, len, data);
+ BLK_TA_DRV_DATA, 0, len, data);
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -974,8 +966,6 @@ static void blk_register_tracepoints(void)
{
int ret;
- ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
- WARN_ON(ret);
ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
WARN_ON(ret);
ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
- unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
tracepoint_synchronize_unregister();
}
diff --git a/lib/kobject.c b/lib/kobject.c
index 445dcaeb0f56d..763d70a189410 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj)
}
EXPORT_SYMBOL(kobject_get);
-static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
+struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
{
+ if (!kobj)
+ return NULL;
if (!kref_get_unless_zero(&kobj->kref))
kobj = NULL;
return kobj;
}
+EXPORT_SYMBOL(kobject_get_unless_zero);
/*
* kobject_cleanup - free kobject resources.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 60e800e0b5a0d..80aa8d5463faf 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -79,15 +79,15 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
}
EXPORT_SYMBOL_GPL(sbitmap_resize);
-static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
- bool wrap)
+static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
+ unsigned int hint, bool wrap)
{
unsigned int orig_hint = hint;
int nr;
while (1) {
- nr = find_next_zero_bit(&word->word, word->depth, hint);
- if (unlikely(nr >= word->depth)) {
+ nr = find_next_zero_bit(word, depth, hint);
+ if (unlikely(nr >= depth)) {
/*
* We started with an offset, and we didn't reset the
* offset to 0 in a failure case, so start from 0 to
@@ -100,11 +100,11 @@ static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint,
return -1;
}
- if (!test_and_set_bit(nr, &word->word))
+ if (!test_and_set_bit(nr, word))
break;
hint = nr + 1;
- if (hint >= word->depth - 1)
+ if (hint >= depth - 1)
hint = 0;
}
@@ -119,7 +119,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
index = SB_NR_TO_INDEX(sb, alloc_hint);
for (i = 0; i < sb->map_nr; i++) {
- nr = __sbitmap_get_word(&sb->map[index],
+ nr = __sbitmap_get_word(&sb->map[index].word,
+ sb->map[index].depth,
SB_NR_TO_BIT(sb, alloc_hint),
!round_robin);
if (nr != -1) {
@@ -141,6 +142,37 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
}
EXPORT_SYMBOL_GPL(sbitmap_get);
+int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
+ unsigned long shallow_depth)
+{
+ unsigned int i, index;
+ int nr = -1;
+
+ index = SB_NR_TO_INDEX(sb, alloc_hint);
+
+ for (i = 0; i < sb->map_nr; i++) {
+ nr = __sbitmap_get_word(&sb->map[index].word,
+ min(sb->map[index].depth, shallow_depth),
+ SB_NR_TO_BIT(sb, alloc_hint), true);
+ if (nr != -1) {
+ nr += index << sb->shift;
+ break;
+ }
+
+ /* Jump to next index. */
+ index++;
+ alloc_hint = index << sb->shift;
+
+ if (index >= sb->map_nr) {
+ index = 0;
+ alloc_hint = 0;
+ }
+ }
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(sbitmap_get_shallow);
+
bool sbitmap_any_bit_set(const struct sbitmap *sb)
{
unsigned int i;
@@ -342,6 +374,35 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq)
}
EXPORT_SYMBOL_GPL(__sbitmap_queue_get);
+int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
+ unsigned int shallow_depth)
+{
+ unsigned int hint, depth;
+ int nr;
+
+ hint = this_cpu_read(*sbq->alloc_hint);
+ depth = READ_ONCE(sbq->sb.depth);
+ if (unlikely(hint >= depth)) {
+ hint = depth ? prandom_u32() % depth : 0;
+ this_cpu_write(*sbq->alloc_hint, hint);
+ }
+ nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth);
+
+ if (nr == -1) {
+ /* If the map is full, a hint won't do us much good. */
+ this_cpu_write(*sbq->alloc_hint, 0);
+ } else if (nr == hint || unlikely(sbq->round_robin)) {
+ /* Only update the hint if we used it. */
+ hint = nr + 1;
+ if (hint >= depth - 1)
+ hint = 0;
+ this_cpu_write(*sbq->alloc_hint, hint);
+ }
+
+ return nr;
+}
+EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow);
+
static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
{
int i, wake_index;
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c6f2a37028c20..f028a9a472fd9 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,8 +12,6 @@
#include <linux/device.h>
#include <trace/events/writeback.h>
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void)
}
postcore_initcall(bdi_class_init);
+static int bdi_init(struct backing_dev_info *bdi);
+
static int __init default_bdi_init(void)
{
int err;
@@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
+ if (wb != &bdi->wb)
+ bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested)
- return -ENOMEM;
+ if (!wb->congested) {
+ err = -ENOMEM;
+ goto out_put_bdi;
+ }
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -335,9 +339,14 @@ out_destroy_stat:
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
wb_congested_put(wb->congested);
+out_put_bdi:
+ if (wb != &bdi->wb)
+ bdi_put(bdi);
return err;
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
+
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
@@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb)
spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock);
+ /*
+ * Wait for wb shutdown to finish if someone else is just
+ * running wb_shutdown(). Otherwise we could proceed to wb /
+ * bdi destruction before wb_shutdown() is finished.
+ */
+ wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
return;
}
+ set_bit(WB_shutting_down, &wb->state);
spin_unlock_bh(&wb->work_lock);
+ cgwb_remove_from_bdi_list(wb);
/*
* Drain work list and shutdown the delayed_work. !WB_registered
* tells wb_workfn() that @wb is dying and its work_list needs to
@@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ /*
+ * Make sure bit gets cleared after shutdown is finished. Matches with
+ * the barrier provided by test_and_clear_bit() above.
+ */
+ smp_wmb();
+ clear_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb)
fprop_local_destroy_percpu(&wb->completions);
wb_congested_put(wb->congested);
+ if (wb != &wb->bdi->wb)
+ bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb)
/*
* cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
* blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected. cgwb_release_wait is used to wait for the completion of cgwb
- * releases from bdi destruction path.
+ * protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
-static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
/**
* wb_congested_get_create - get or create a wb_congested
@@ -438,7 +461,7 @@ retry:
return NULL;
atomic_set(&new_congested->refcnt, 0);
- new_congested->bdi = bdi;
+ new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
@@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
}
/* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->bdi) {
+ if (congested->__bdi) {
rb_erase(&congested->rb_node,
- &congested->bdi->cgwb_congested_tree);
- congested->bdi = NULL;
+ &congested->__bdi->cgwb_congested_tree);
+ congested->__bdi = NULL;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct backing_dev_info *bdi = wb->bdi;
-
- spin_lock_irq(&cgwb_lock);
- list_del_rcu(&wb->bdi_node);
- spin_unlock_irq(&cgwb_lock);
wb_shutdown(wb);
@@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work)
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
kfree_rcu(wb, rcu);
-
- if (atomic_dec_and_test(&bdi->usage_cnt))
- wake_up_all(&cgwb_release_wait);
}
static void cgwb_release(struct percpu_ref *refcnt)
@@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb)
percpu_ref_kill(&wb->refcnt);
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_del_rcu(&wb->bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+}
+
static int cgwb_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
@@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
/* we might have raced another instance of this function */
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
- atomic_inc(&bdi->usage_cnt);
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
@@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
- atomic_set(&bdi->usage_cnt, 1);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
@@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return ret;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
void **slot;
+ struct bdi_writeback *wb;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
- spin_unlock_irq(&cgwb_lock);
- /*
- * All cgwb's must be shutdown and released before returning. Drain
- * the usage counter to wait for all cgwb's ever created on @bdi.
- */
- atomic_dec(&bdi->usage_cnt);
- wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
- /*
- * Grab back our reference so that we hold it when @bdi gets
- * re-registered.
- */
- atomic_inc(&bdi->usage_cnt);
+ while (!list_empty(&bdi->wb_list)) {
+ wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+ bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+ wb_shutdown(wb);
+ spin_lock_irq(&cgwb_lock);
+ }
+ spin_unlock_irq(&cgwb_lock);
}
/**
@@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi)
rb_entry(rbn, struct bdi_writeback_congested, rb_node);
rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->bdi = NULL; /* mark @congested unlinked */
+ congested->__bdi = NULL; /* mark @congested unlinked */
}
spin_unlock_irq(&cgwb_lock);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+ spin_unlock_irq(&cgwb_lock);
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -777,16 +801,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return 0;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
static void cgwb_bdi_exit(struct backing_dev_info *bdi)
{
wb_congested_put(bdi->wb_congested);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ list_del_rcu(&wb->bdi_node);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
-int bdi_init(struct backing_dev_info *bdi)
+static int bdi_init(struct backing_dev_info *bdi)
{
int ret;
@@ -802,11 +836,8 @@ int bdi_init(struct backing_dev_info *bdi)
ret = cgwb_bdi_init(bdi);
- list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
-
return ret;
}
-EXPORT_SYMBOL(bdi_init);
struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
{
@@ -823,22 +854,20 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
return bdi;
}
+EXPORT_SYMBOL(bdi_alloc_node);
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
- va_list args;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
- va_start(args, fmt);
- dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
- va_end(args);
+ dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
if (IS_ERR(dev))
return PTR_ERR(dev);
+ cgwb_bdi_register(bdi);
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
@@ -851,20 +880,25 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
trace_writeback_bdi_register(bdi);
return 0;
}
-EXPORT_SYMBOL(bdi_register);
+EXPORT_SYMBOL(bdi_register_va);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
- return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ return ret;
}
-EXPORT_SYMBOL(bdi_register_dev);
+EXPORT_SYMBOL(bdi_register);
int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
{
int rc;
- rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
- MINOR(owner->devt));
+ rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
if (rc)
return rc;
/* Leaking owner reference... */
@@ -892,7 +926,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
- cgwb_bdi_destroy(bdi);
+ cgwb_bdi_unregister(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
@@ -906,19 +940,16 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
}
-static void bdi_exit(struct backing_dev_info *bdi)
-{
- WARN_ON_ONCE(bdi->dev);
- wb_exit(&bdi->wb);
- cgwb_bdi_exit(bdi);
-}
-
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
- bdi_exit(bdi);
+ if (test_bit(WB_registered, &bdi->wb.state))
+ bdi_unregister(bdi);
+ WARN_ON_ONCE(bdi->dev);
+ wb_exit(&bdi->wb);
+ cgwb_bdi_exit(bdi);
kfree(bdi);
}
@@ -926,38 +957,7 @@ void bdi_put(struct backing_dev_info *bdi)
{
kref_put(&bdi->refcnt, release_bdi);
}
-
-void bdi_destroy(struct backing_dev_info *bdi)
-{
- bdi_unregister(bdi);
- bdi_exit(bdi);
-}
-EXPORT_SYMBOL(bdi_destroy);
-
-/*
- * For use from filesystems to quickly init and register a bdi associated
- * with dirty writeback
- */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
-{
- int err;
-
- bdi->name = name;
- bdi->capabilities = 0;
- err = bdi_init(bdi);
- if (err)
- return err;
-
- err = bdi_register(bdi, NULL, "%.28s-%ld", name,
- atomic_long_inc_return(&bdi_seq));
- if (err) {
- bdi_destroy(bdi);
- return err;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bdi_setup_and_register);
+EXPORT_SYMBOL(bdi_put);
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
diff --git a/net/core/utils.c b/net/core/utils.c
index 6592d7bbed394..32c467cf52d65 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -26,9 +26,11 @@
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/ratelimit.h>
+#include <linux/socket.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
+#include <net/ipv6.h>
#include <asm/byteorder.h>
#include <linux/uaccess.h>
@@ -300,6 +302,107 @@ out:
}
EXPORT_SYMBOL(in6_pton);
+static int inet4_pton(const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
+ int srclen = strlen(src);
+
+ if (srclen > INET_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr,
+ '\n', NULL) == 0)
+ return -EINVAL;
+
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = htons(port_num);
+
+ return 0;
+}
+
+static int inet6_pton(struct net *net, const char *src, u16 port_num,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
+ const char *scope_delim;
+ int srclen = strlen(src);
+
+ if (srclen > INET6_ADDRSTRLEN)
+ return -EINVAL;
+
+ if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr,
+ '%', &scope_delim) == 0)
+ return -EINVAL;
+
+ if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL &&
+ src + srclen != scope_delim && *scope_delim == '%') {
+ struct net_device *dev;
+ char scope_id[16];
+ size_t scope_len = min_t(size_t, sizeof(scope_id) - 1,
+ src + srclen - scope_delim - 1);
+
+ memcpy(scope_id, scope_delim + 1, scope_len);
+ scope_id[scope_len] = '\0';
+
+ dev = dev_get_by_name(net, scope_id);
+ if (dev) {
+ addr6->sin6_scope_id = dev->ifindex;
+ dev_put(dev);
+ } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) {
+ return -EINVAL;
+ }
+ }
+
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(port_num);
+
+ return 0;
+}
+
+/**
+ * inet_pton_with_scope - convert an IPv4/IPv6 address and port to a socket address
+ * @net: net namespace (used for scope handling)
+ * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either
+ * @src: the start of the address string
+ * @port: the start of the port string (or NULL for none)
+ * @addr: output socket address
+ *
+ * Return zero on success, or a negative errno value when any error occurs.
+ */
+int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
+ const char *src, const char *port, struct sockaddr_storage *addr)
+{
+ u16 port_num;
+ int ret = -EINVAL;
+
+ if (port) {
+ if (kstrtou16(port, 0, &port_num))
+ return -EINVAL;
+ } else {
+ port_num = 0;
+ }
+
+ switch (af) {
+ case AF_INET:
+ ret = inet4_pton(src, port_num, addr);
+ break;
+ case AF_INET6:
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ case AF_UNSPEC:
+ ret = inet4_pton(src, port_num, addr);
+ if (ret)
+ ret = inet6_pton(net, src, port_num, addr);
+ break;
+ default:
+ pr_err("unexpected address family %d\n", af);
+ };
+
+ return ret;
+}
+EXPORT_SYMBOL(inet_pton_with_scope);
+
void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
__be32 from, __be32 to, bool pseudohdr)
{